inst/doc/NMT.R

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## ----eval=F-------------------------------------------------------------------
#  
#  library(tensorflow)
#  library(keras)
#  library(data.table)
#  library(tfdatasets)
#  library(tfaddons)
#  
#  # Preprocessing -----------------------------------------------------------
#  
#  # Downloads and unzips one of the bilingual datasets offered at
#  # http://www.manythings.org/anki/ into a directory "data".
#  # This example translates English to Dutch.
#  download_data = function(){
#    if(!dir.exists('data')) {
#      dir.create('data')
#    }
#    if(!file.exists('data/nld-eng.zip')) {
#      download.file('http://www.manythings.org/anki/nld-eng.zip',
#                    destfile = file.path("data", basename('nld-eng.zip')))
#      unzip('data/nld-eng.zip', exdir = 'data')
#    }
#  }
#  
#  download_data()
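#  
#  # The archive unpacks to "nld.txt", a tab-separated file of sentence pairs;
#  # a quick check that the file is in place (illustrative):
#  file.exists(file.path("data", "nld.txt"))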
#  
#  filepath <- file.path("data", "nld.txt")
#  
#  df = data.table::fread(filepath, header = FALSE, encoding = 'UTF-8',
#                         select = c(1,2), nrows = -1)
#  
#  text_cleaner <- function(text){
#    text %>%
#      # replace non-ASCII characters
#      textclean::replace_non_ascii() %>%
#      # strip everything except letters and spaces (apostrophes are removed as well)
#      textclean::strip(apostrophe.remove = TRUE) %>%
#      # add the sequence markers the decoder relies on
#      paste('<start> ', ., ' <end>')
#  }
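#  
#  # A quick look at what the cleaner produces (illustrative; textclean::strip()
#  # also lower-cases by default):
#  text_cleaner("Go away!")   # roughly "<start>  go away  <end>"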
#  
#  df = sapply(1:2, function(x) text_cleaner(df[[x]])) %>% as.data.table()
#  
#  text_tok <- function(text) {
#    tokenizer = text_tokenizer(filters = '')
#    tokenizer %>% fit_text_tokenizer(text)
#    # the tokenizer's word index; its length gives the vocabulary size
#    vocab_size = tokenizer$word_index
#    data = tokenizer %>%
#      texts_to_sequences(text) %>%
#      pad_sequences(padding = 'post')
#    list(vocab_size, data, tokenizer)
#  }
#  
#  c(input_vocab_size, data_en, tokenizer_en) %<-% text_tok(df[['V1']])
#  
#  c(output_vocab_size, data_de, tokenizer_de) %<-% text_tok(df[['V2']])
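#  
#  # text_tok() returns the tokenizer's word index (its length is the vocabulary
#  # size), the padded integer matrix and the tokenizer itself. A quick look at
#  # the English side (illustrative):
#  length(input_vocab_size)   # number of distinct English tokens
#  dim(data_en)               # sentences x padded sequence length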
#  
#  
#  # Split the dataset
#  indices_to_take = sample.int(n = nrow(df), size = floor(0.8*nrow(df)), replace = FALSE)
#  
#  split_data <- function(data) {
#    c(train, test) %<-% list(data[indices_to_take, ], data[-indices_to_take, ] )
#    list(train, test)
#  }
#  
#  
#  c(en_train, en_test, de_train, de_test) %<-% c(split_data(data_en), split_data(data_de))
#  
#  rm(df, filepath, indices_to_take, download_data, split_data, text_cleaner, text_tok)
#  
#  batch_size = 64L
#  buffer_size = nrow(en_train)
#  steps_per_epoch = buffer_size  %/% batch_size
#  embedding_dims = 256L
#  rnn_units = 1024L
#  dense_units = 1024L
#  dtype = tf$float32   # used to initialize the decoder cell's zero state
#  
#  
#  dataset = tensor_slices_dataset(list(en_train, de_train)) %>%
#    dataset_shuffle(buffer_size) %>% dataset_batch(batch_size, drop_remainder = TRUE)
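#  
#  # Optional sanity check (illustrative): pull a single batch to confirm the
#  # pipeline yields (source, target) integer tensors of shape (batch_size, max_len).
#  example_batch = dataset %>% dataset_take(1) %>% iterate()
#  lapply(example_batch[[1]], function(x) x$shape)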
#  
#  
#  EncoderNetwork = reticulate::PyClass(
#    'EncoderNetwork',
#    inherit = tf$keras$Model,
#    defs = list(
#  
#      `__init__` = function(self, input_vocab_size, embedding_dims, rnn_units) {
#  
#        super()$`__init__`()
#  
#        self$encoder_embedding = layer_embedding(input_dim = length(input_vocab_size) + 1L,  # +1 for the padding index 0
#                                                 output_dim = embedding_dims)
#        self$encoder_rnnlayer = layer_lstm(units = rnn_units, return_sequences = TRUE,
#                                           return_state = TRUE)
#        NULL
#      }
#    )
#  )
#  
#  
#  
#  DecoderNetwork = reticulate::PyClass(
#    'DecoderNetwork',
#    inherit = tf$keras$Model,
#    defs = list(
#  
#      `__init__` = function(self, output_vocab_size, embedding_dims, rnn_units) {
#  
#        super()$`__init__`()
#        self$decoder_embedding = layer_embedding(input_dim = length(output_vocab_size) + 1L,  # +1 for the padding index 0
#                                                 output_dim = embedding_dims)
#        self$dense_layer = layer_dense(units = length(output_vocab_size) + 1L)
#        self$decoder_rnncell = tf$keras$layers$LSTMCell(rnn_units)
#        # Sampler
#        self$sampler = sampler_training()
#        # Create attention mechanism with memory = NULL
#        self$attention_mechanism = self$build_attention_mechanism(dense_units, NULL, c(rep(ncol(data_en), batch_size)))
#        self$rnn_cell =  self$build_rnn_cell(batch_size)
#        self$decoder = decoder_basic(cell=self$rnn_cell, sampler = self$sampler,
#                                     output_layer = self$dense_layer)
#        NULL
#      },
#  
#  
#  
#      build_attention_mechanism = function(self, units, memory, memory_sequence_length) {
#        attention_luong(units = units , memory = memory,
#                        memory_sequence_length = memory_sequence_length)
#      },
#  
#      build_rnn_cell = function(self, batch_size) {
#        rnn_cell = attention_wrapper(cell = self$decoder_rnncell,
#                                     attention_mechanism = self$attention_mechanism,
#                                     attention_layer_size = dense_units)
#        rnn_cell
#      },
#  
#      build_decoder_initial_state = function(self, batch_size, encoder_state, dtype) {
#        decoder_initial_state = self$rnn_cell$get_initial_state(batch_size = batch_size,
#                                                                dtype = dtype)
#        decoder_initial_state = decoder_initial_state$clone(cell_state = encoder_state)
#        decoder_initial_state
#      }
#    )
#  )
#  
#  encoderNetwork = EncoderNetwork(input_vocab_size, embedding_dims, rnn_units)
#  decoderNetwork = DecoderNetwork(output_vocab_size, embedding_dims, rnn_units)
#  optimizer = tf$keras$optimizers$Adam()
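#  
#  # Optional smoke test (illustrative, can be skipped): run one batch through the
#  # encoder to check that it returns the full output sequence plus the final
#  # hidden and cell state, which train_step() below hands to the decoder.
#  c(ex_input, ex_target) %<-% (dataset %>% dataset_take(1) %>% iterate())[[1]]
#  c(ex_a, ex_a_tx, ex_c_tx) %<-% encoderNetwork$encoder_rnnlayer(encoderNetwork$encoder_embedding(ex_input))
#  ex_a$shape      # (batch_size, max_len, rnn_units)
#  ex_a_tx$shape   # (batch_size, rnn_units)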
#  
#  
#  
#  loss_function <- function(y_pred, y) {
#    # shape of y:      [batch_size, Ty]
#    # shape of y_pred: [batch_size, Ty, output_vocab_size]
#    # the decoder's dense layer has no activation, so y_pred holds logits
#    loss = tf$keras$losses$sparse_categorical_crossentropy(y, y_pred, from_logits = TRUE)
#    mask = tf$logical_not(tf$math$equal(y, 0L))   # 0 for padded positions (y = 0), 1 otherwise
#    mask = tf$cast(mask, dtype = loss$dtype)
#    loss = mask * loss
#    loss = tf$reduce_mean(loss)
#    loss
#  }
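#  
#  # A tiny illustration of the mask used above (hypothetical target values):
#  # padded positions (index 0) drop out of the loss entirely.
#  tf$cast(tf$logical_not(tf$math$equal(tf$constant(c(5L, 2L, 0L, 0L)), 0L)), tf$float32)
#  # tf.Tensor([1. 1. 0. 0.], shape=(4,), dtype=float32)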
#  
#  train_step <- function(input_batch, output_batch, encoder_initial_cell_state) {
#    loss = 0L
#  
#    with(tf$GradientTape() %as% tape, {
#      encoder_emb_inp = encoderNetwork$encoder_embedding(input_batch)
#      c(a, a_tx, c_tx) %<-% encoderNetwork$encoder_rnnlayer(encoder_emb_inp,
#                                                            initial_state = encoder_initial_cell_state)
#  
#      # [last step activations, last memory state] of the encoder are passed to the decoder network
#      # Prepare correct decoder input & output sequence data (teacher forcing)
#      decoder_input = tf$convert_to_tensor(output_batch %>% as.array() %>% .[, 1:(ncol(data_de) - 1)])  # drop the final position (<end>/padding)
#      # compare logits with the decoder input shifted by one timestep
#      decoder_output = tf$convert_to_tensor(output_batch %>% as.array() %>% .[, 2:ncol(data_de)])       # drop <start>
#  
#      # Decoder Embeddings
#      decoder_emb_inp = decoderNetwork$decoder_embedding(decoder_input)
#  
#      #Setting up decoder memory from encoder output and Zero State for AttentionWrapperState
#      decoderNetwork$attention_mechanism$setup_memory(a)
#      decoder_initial_state = decoderNetwork$build_decoder_initial_state(batch_size,
#                                                                         encoder_state = list(a_tx, c_tx),
#                                                                         dtype = tf$float32)
#      #BasicDecoderOutput
#      c(outputs, res1, res2) %<-% decoderNetwork$decoder(decoder_emb_inp, initial_state = decoder_initial_state,
#                                                         sequence_length = c(rep(ncol(data_de) - 1L, batch_size)))  # decoder sequences have ncol(data_de) - 1 steps
#  
#      logits = outputs$rnn_output
#      #Calculate loss
#  
#      loss = loss_function(logits, decoder_output)
#  
#    })
#    # collect all trainable variables / weights of both networks
#    variables = c(encoderNetwork$trainable_variables, decoderNetwork$trainable_variables)
#    # differentiate the loss w.r.t. those variables
#    gradients = tape$gradient(loss, variables)
#    # grads_and_vars: list of (gradient, variable) pairs
#    grads_and_vars = purrr::transpose(list(gradients, variables))
#    optimizer$apply_gradients(grads_and_vars)
#    loss
#  }
#  
#  initialize_initial_state = function() {
#    list(tf$zeros(c(batch_size, rnn_units)), tf$zeros(c(batch_size, rnn_units)))
#  }
#  
#  
#  epochs = 1
#  
#  
#  for (i in seq_len(epochs)) {
#    encoder_initial_cell_state = initialize_initial_state()
#    total_loss = 0.0
#    res = dataset %>% dataset_take(steps_per_epoch) %>% iterate()
#    for (batch in 1:length(res)) {
#      c(input_batch, output_batch) %<-% res[[batch]]
#      batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
#      total_loss = total_loss + batch_loss
#      if (batch %% 5 == 0) {
#        print(paste('batch loss:', batch_loss$numpy(), 'epoch', i, 'batch', batch))
#      }
#    }
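#  
#    # Average loss over the batches seen this epoch (optional reporting):
#    print(paste('epoch', i, 'mean loss:', (total_loss / length(res))$numpy()))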
#  
#  }
#  
