keras: R Interface to 'Keras'

#' Trains a memory network on the bAbI dataset.
#' 
#' References:
#' 
#' - Jason Weston, Antoine Bordes, Sumit Chopra, Tomas Mikolov, Alexander M. Rush,
#'   "Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks",
#'   http://arxiv.org/abs/1502.05698
#' 
#' - Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, Rob Fergus,
#'   "End-To-End Memory Networks", http://arxiv.org/abs/1503.08895
#' 
#' Reaches 98.6% accuracy on task 'single_supporting_fact_10k' after 120 epochs.
#' Time per epoch: 3s on CPU (core i7).
#' 

library(keras)
library(readr)
library(stringr)
library(purrr)
library(tibble)
library(dplyr)

# Function definition -----------------------------------------------------

tokenize_words <- function(x){
  x <- x %>% 
    str_replace_all('([[:punct:]]+)', ' \\1') %>% 
    str_split(' ') %>%
    unlist()
  x[x != ""]
}

parse_stories <- function(lines, only_supporting = FALSE){
  lines <- lines %>% 
    str_split(" ", n = 2) %>%
    map_df(~tibble(nid = as.integer(.x[[1]]), line = .x[[2]]))
  
  lines <- lines %>%
    mutate(
      split = map(line, ~str_split(.x, "\t")[[1]]),
      q = map_chr(split, ~.x[1]),
      a = map_chr(split, ~.x[2]),
      supporting = map(split, ~.x[3] %>% str_split(" ") %>% unlist() %>% as.integer()),
      story_id = c(0, cumsum(nid[-nrow(.)] > nid[-1]))
    ) %>%
    select(-split)
  
  stories <- lines %>%
    filter(is.na(a)) %>%
    select(nid_story = nid, story_id, story = q)
  
  questions <- lines %>%
    filter(!is.na(a)) %>%
    select(-line) %>%
    left_join(stories, by = "story_id") %>%
    filter(nid_story < nid)
  
  if(only_supporting){
    questions <- questions %>%
      filter(map2_lgl(nid_story, supporting, ~.x %in% .y))
  }
  
  questions %>%
    group_by(story_id, nid, question = q, answer = a) %>%
    summarise(story = paste(story, collapse = " ")) %>%
    ungroup() %>% 
    mutate(
      question = map(question, ~tokenize_words(.x)),
      story = map(story, ~tokenize_words(.x)),
      id = row_number()
    ) %>%
    select(id, question, answer, story)
}

vectorize_stories <- function(data, vocab, story_maxlen, query_maxlen){
  
  questions <- map(data$question, function(x){
    map_int(x, ~which(.x == vocab))
  })
  
  stories <- map(data$story, function(x){
    map_int(x, ~which(.x == vocab))
  })
  
  # "" represents padding
  answers <- sapply(c("", vocab), function(x){
    as.integer(x == data$answer)
  })
  
  list(
    questions = pad_sequences(questions, maxlen = query_maxlen),
    stories   = pad_sequences(stories, maxlen = story_maxlen),
    answers   = answers
  )
}


# Parameters --------------------------------------------------------------

challenges <- list(
  # QA1 with 10,000 samples
  single_supporting_fact_10k = "%stasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_%s.txt",
  # QA2 with 10,000 samples
  two_supporting_facts_10k = "%stasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_%s.txt"
)

challenge_type <- "single_supporting_fact_10k"
challenge <- challenges[[challenge_type]]
max_length <- 999999


# Data Preparation --------------------------------------------------------

# Download data
path <- get_file(
  fname = "babi-tasks-v1-2.tar.gz",
  origin = "https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz"
)
untar(path, exdir = str_replace(path, fixed(".tar.gz"), "/"))
path <- str_replace(path, fixed(".tar.gz"), "/")

# Reading training and test data
train <- read_lines(sprintf(challenge, path, "train")) %>%
  parse_stories() %>%
  filter(map_int(story, ~length(.x)) <= max_length)

test <- read_lines(sprintf(challenge, path, "test")) %>%
  parse_stories() %>%
  filter(map_int(story, ~length(.x)) <= max_length)

# Extract the vocabulary
all_data <- bind_rows(train, test)
vocab <- c(unlist(all_data$question), all_data$answer, 
           unlist(all_data$story)) %>%
  unique() %>%
  sort()

# Reserve 0 for masking via pad_sequences
vocab_size <- length(vocab) + 1
story_maxlen <- map_int(all_data$story, ~length(.x)) %>% max()
query_maxlen <- map_int(all_data$question, ~length(.x)) %>% max()

# Vectorized versions of training and test sets
train_vec <- vectorize_stories(train, vocab, story_maxlen, query_maxlen)
test_vec <- vectorize_stories(test, vocab, story_maxlen, query_maxlen)


# Defining the model ------------------------------------------------------

# Placeholders
sequence <- layer_input(shape = c(story_maxlen))
question <- layer_input(shape = c(query_maxlen))

# Encoders
# Embed the input sequence into a sequence of vectors
sequence_encoder_m <- keras_model_sequential()
sequence_encoder_m %>%
  layer_embedding(input_dim = vocab_size, output_dim = 64) %>%
  layer_dropout(rate = 0.3)
# output: (samples, story_maxlen, embedding_dim)

# Embed the input into a sequence of vectors of size query_maxlen
sequence_encoder_c <- keras_model_sequential()
sequence_encoder_c %>%
  layer_embedding(input_dim = vocab_size, output = query_maxlen) %>%
  layer_dropout(rate = 0.3)
# output: (samples, story_maxlen, query_maxlen)

# Embed the question into a sequence of vectors
question_encoder <- keras_model_sequential()
question_encoder %>%
  layer_embedding(input_dim = vocab_size, output_dim = 64, 
                  input_length = query_maxlen) %>%
  layer_dropout(rate = 0.3)
# output: (samples, query_maxlen, embedding_dim)

# Encode input sequence and questions (which are indices)
# to sequences of dense vectors
sequence_encoded_m <- sequence_encoder_m(sequence)
sequence_encoded_c <- sequence_encoder_c(sequence)
question_encoded <- question_encoder(question)

# Compute a 'match' between the first input vector sequence
# and the question vector sequence
# shape: `(samples, story_maxlen, query_maxlen)`
match <- list(sequence_encoded_m, question_encoded) %>%
  layer_dot(axes = c(2,2)) %>%
  layer_activation("softmax")

# Add the match matrix with the second input vector sequence
response <- list(match, sequence_encoded_c) %>%
  layer_add() %>%
  layer_permute(c(2,1))

# Concatenate the match matrix with the question vector sequence
answer <- list(response, question_encoded) %>%
  layer_concatenate() %>%
  # The original paper uses a matrix multiplication for this reduction step.
  # We choose to use an RNN instead.
  layer_lstm(32) %>%
  # One regularization layer -- more would probably be needed.
  layer_dropout(rate = 0.3) %>%
  layer_dense(vocab_size) %>%
  # We output a probability distribution over the vocabulary
  layer_activation("softmax")

# Build the final model
model <- keras_model(inputs = list(sequence, question), answer)
model %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = "accuracy"
)


# Training ----------------------------------------------------------------

model %>% fit(
  x = list(train_vec$stories, train_vec$questions),
  y = train_vec$answers,
  batch_size = 32,
  epochs = 120,
  validation_data = list(list(test_vec$stories, test_vec$questions), test_vec$answers)
)