#' Trains two recurrent neural networks based upon a story and a question.
#' The resulting merged vector is then queried to answer a range of bAbI tasks.
#'
#' The results are comparable to those for an LSTM model provided in Weston et al.:
#' "Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks"
#' http://arxiv.org/abs/1502.05698
#'
#' Task Number | FB LSTM Baseline | Keras QA
#' --- | --- | ---
#' QA1 - Single Supporting Fact | 50 | 100.0
#' QA2 - Two Supporting Facts | 20 | 50.0
#' QA3 - Three Supporting Facts | 20 | 20.5
#' QA4 - Two Arg. Relations | 61 | 62.9
#' QA5 - Three Arg. Relations | 70 | 61.9
#' QA6 - Yes/No Questions | 48 | 50.7
#' QA7 - Counting | 49 | 78.9
#' QA8 - Lists/Sets | 45 | 77.2
#' QA9 - Simple Negation | 64 | 64.0
#' QA10 - Indefinite Knowledge | 44 | 47.7
#' QA11 - Basic Coreference | 72 | 74.9
#' QA12 - Conjunction | 74 | 76.4
#' QA13 - Compound Coreference | 94 | 94.4
#' QA14 - Time Reasoning | 27 | 34.8
#' QA15 - Basic Deduction | 21 | 32.4
#' QA16 - Basic Induction | 23 | 50.6
#' QA17 - Positional Reasoning | 51 | 49.1
#' QA18 - Size Reasoning | 52 | 90.8
#' QA19 - Path Finding | 8 | 9.0
#' QA20 - Agent's Motivations | 91 | 90.7
#'
#' For the resources related to the bAbI project, refer to:
#' https://research.facebook.com/researchers/1543934539189348
#'
#' Notes:
#'
#' - With default word, sentence, and query vector sizes, a GRU-based version of
#' this model achieves:
#' - 100% test accuracy on QA1 in 20 epochs (2 seconds per epoch on CPU)
#' - 50% test accuracy on QA2 in 20 epochs (16 seconds per epoch on CPU)
#' In comparison, the Facebook paper achieves 50% and 20% for the LSTM baseline.
#' (The script below uses LSTM layers; a GRU swap sketch follows the model
#' definition.)
#'
#' - Unlike the traditional setup for this task, the question is parsed by its
#' own RNN here. This likely improves accuracy and is a good example of merging
#' two RNNs.
#'
#' - The word vector embeddings are not shared between the story and question RNNs.
#'
#' - See how the accuracy changes with 10,000 training samples (en-10k) instead
#' of only 1,000; 1,000 is used here to remain comparable to the original paper.
#'
#' - Experiment with GRU and LSTM layers, as they give subtly different results.
#' (The original Keras example also suggests JZS1-3 units, which are no longer
#' available in current versions of Keras.)
#'
#' - Story length and noise (i.e. 'useless' story components) impact the ability
#' of LSTMs / GRUs to provide the correct answer. Given only the supporting facts,
#' these RNNs can achieve 100% accuracy on many tasks. Memory networks and neural
#' networks that use attentional processes can efficiently search through this
#' noise to find the relevant statements, improving performance substantially.
#' This becomes especially obvious on QA2 and QA3, both far longer than QA1.
#'
library(keras)
library(readr)
library(stringr)
library(purrr)
library(tibble)
library(dplyr)

# Function definitions ------------------------------------------------------
tokenize_words <- function(x) {
  # Put a space before punctuation so it splits into its own token,
  # then split on spaces and drop any empty strings
  x <- x %>%
    str_replace_all('([[:punct:]]+)', ' \\1') %>%
    str_split(' ') %>%
    unlist()
  x[x != ""]
}
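# A quick illustration (not part of the pipeline): punctuation becomes its own token
# tokenize_words("Where is Mary?")
# #> [1] "Where" "is"    "Mary"  "?"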
parse_stories <- function(lines, only_supporting = FALSE) {
  # Split each line into its within-story id (nid) and the remaining text
  lines <- lines %>%
    str_split(" ", n = 2) %>%
    map_df(~ tibble(nid = as.integer(.x[[1]]), line = .x[[2]]))
  # Question lines are tab-separated: question \t answer \t supporting fact ids.
  # A drop in nid marks the start of a new story.
  lines <- lines %>%
    mutate(
      split = map(line, ~ str_split(.x, "\t")[[1]]),
      q = map_chr(split, ~ .x[1]),
      a = map_chr(split, ~ .x[2]),
      supporting = map(split, ~ .x[3] %>% str_split(" ") %>% unlist() %>% as.integer()),
      story_id = c(0, cumsum(nid[-nrow(.)] > nid[-1]))
    ) %>%
    select(-split)
  # Statement lines have no answer; question lines do
  stories <- lines %>%
    filter(is.na(a)) %>%
    select(nid_story = nid, story_id, story = q)
  # Pair each question with the story lines that precede it
  questions <- lines %>%
    filter(!is.na(a)) %>%
    select(-line) %>%
    left_join(stories, by = "story_id") %>%
    filter(nid_story < nid)
  if (only_supporting) {
    # Keep only the story lines flagged as supporting facts
    questions <- questions %>%
      filter(map2_lgl(nid_story, supporting, ~ .x %in% .y))
  }
  # Collapse each question's story lines into a single token sequence
  questions %>%
    group_by(story_id, nid, question = q, answer = a) %>%
    summarise(story = paste(story, collapse = " ")) %>%
    ungroup() %>%
    mutate(
      question = map(question, ~ tokenize_words(.x)),
      story = map(story, ~ tokenize_words(.x)),
      id = row_number()
    ) %>%
    select(id, question, answer, story)
}
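# For reference, the raw bAbI format that parse_stories() expects: statement
# lines are "<nid> <sentence>", question lines are tab-separated into question,
# answer, and supporting fact ids, and nid restarts at 1 for each new story:
#   1 Mary moved to the bathroom.
#   2 John went to the hallway.
#   3 Where is Mary?\tbathroom\t1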
vectorize_stories <- function(data, vocab, story_maxlen, query_maxlen) {
  # Map each token to its (1-based) index in the vocabulary
  questions <- map(data$question, function(x) {
    map_int(x, ~ which(.x == vocab))
  })
  stories <- map(data$story, function(x) {
    map_int(x, ~ which(.x == vocab))
  })
  # One-hot encode the answers; "" represents padding (index 0)
  answers <- sapply(c("", vocab), function(x) {
    as.integer(x == data$answer)
  })
  list(
    questions = pad_sequences(questions, maxlen = query_maxlen),
    stories = pad_sequences(stories, maxlen = story_maxlen),
    answers = answers
  )
}
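# The returned list contains, for n samples:
#   questions: integer matrix, n x query_maxlen (zero-padded on the left)
#   stories:   integer matrix, n x story_maxlen (zero-padded on the left)
#   answers:   one-hot matrix, n x (length(vocab) + 1)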
# Parameters --------------------------------------------------------------
max_length <- 99999  # effectively no limit; lower this to filter out long stories
embed_hidden_size <- 50
batch_size <- 32
epochs <- 40
# Data Preparation --------------------------------------------------------
path <- get_file(
  fname = "babi-tasks-v1-2.tar.gz",
  origin = "https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz"
)
untar(path, exdir = str_replace(path, fixed(".tar.gz"), "/"))
path <- str_replace(path, fixed(".tar.gz"), "/")
# Default QA1 with 1000 samples
# challenge = '%stasks_1-20_v1-2/en/qa1_single-supporting-fact_%s.txt'
# QA1 with 10,000 samples
# challenge = '%stasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_%s.txt'
# QA2 with 1000 samples
challenge <- "%stasks_1-20_v1-2/en/qa2_two-supporting-facts_%s.txt"
# QA2 with 10,000 samples
# challenge = '%stasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_%s.txt'
train <- read_lines(sprintf(challenge, path, "train")) %>%
  parse_stories() %>%
  filter(map_int(story, ~ length(.x)) <= max_length)

test <- read_lines(sprintf(challenge, path, "test")) %>%
  parse_stories() %>%
  filter(map_int(story, ~ length(.x)) <= max_length)
# extract the vocabulary
all_data <- bind_rows(train, test)
vocab <- c(unlist(all_data$question), all_data$answer,
           unlist(all_data$story)) %>%
  unique() %>%
  sort()
# Reserve 0 for masking via pad_sequences
vocab_size <- length(vocab) + 1
story_maxlen <- map_int(all_data$story, ~length(.x)) %>% max()
query_maxlen <- map_int(all_data$question, ~length(.x)) %>% max()
# vectorized versions of training and test sets
train_vec <- vectorize_stories(train, vocab, story_maxlen, query_maxlen)
test_vec <- vectorize_stories(test, vocab, story_maxlen, query_maxlen)
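# Sanity check (illustrative): these shapes should match the model inputs below
# dim(train_vec$stories)    # n_train x story_maxlen
# dim(train_vec$questions)  # n_train x query_maxlen
# dim(train_vec$answers)    # n_train x vocab_size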
# Defining the model ------------------------------------------------------
# Story encoder: embed each word of the story
sentence <- layer_input(shape = c(story_maxlen), dtype = "int32")
encoded_sentence <- sentence %>%
  layer_embedding(input_dim = vocab_size, output_dim = embed_hidden_size) %>%
  layer_dropout(rate = 0.3)

# Question encoder: embed, summarize with an LSTM, then repeat the resulting
# vector once per story position so it can be added to the story encoding
question <- layer_input(shape = c(query_maxlen), dtype = "int32")
encoded_question <- question %>%
  layer_embedding(input_dim = vocab_size, output_dim = embed_hidden_size) %>%
  layer_dropout(rate = 0.3) %>%
  layer_lstm(units = embed_hidden_size) %>%
  layer_repeat_vector(n = story_maxlen)

# Merge the two encodings and read the result with a second LSTM
merged <- list(encoded_sentence, encoded_question) %>%
  layer_add() %>%
  layer_lstm(units = embed_hidden_size) %>%
  layer_dropout(rate = 0.3)

# Predict the answer word over the full vocabulary
preds <- merged %>%
  layer_dense(units = vocab_size, activation = "softmax")

model <- keras_model(inputs = list(sentence, question), outputs = preds)
model %>% compile(
  optimizer = "adam",
  loss = "categorical_crossentropy",
  metrics = "accuracy"
)
model
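# As suggested in the notes above, GRUs can be swapped in for the LSTMs to
# compare results, e.g. by replacing each layer_lstm() call with:
# layer_gru(units = embed_hidden_size)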
# Training ----------------------------------------------------------------
model %>% fit(
  x = list(train_vec$stories, train_vec$questions),
  y = train_vec$answers,
  batch_size = batch_size,
  epochs = epochs,
  validation_split = 0.05
)
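# Optionally capture the return value of fit() to inspect or plot the
# training history, e.g.:
# history <- model %>% fit(...)  # same arguments as above
# plot(history)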
evaluation <- model %>% evaluate(
  x = list(test_vec$stories, test_vec$questions),
  y = test_vec$answers,
  batch_size = batch_size
)
evaluation
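# evaluate() reports test loss and accuracy; depending on the keras package
# version the metric may be named acc or accuracy, e.g.:
# evaluation$acc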