# R/extract_features.R

# Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# class InputExample_EF ---------------------------------------------------

#' Construct objects of class \code{InputExample_EF}
#'
#' An InputExample_EF is a single example for feature extraction. Note that
#' this class is similar to the InputExample class used for simple sequence
#' classification, but doesn't have a label property. The name of the id
#' property is also annoyingly different; we should eventually standardize
#' better than the Python folks did. (RBERT issue #28.)
#'
#' @param unique_id Integer or character; a unique id for this example.
#' @param text_a Character; the untokenized text of the first sequence.
#' @param text_b (Optional) Character; the untokenized text of the second
#'   sequence.
#'
#' @return An object of class \code{InputExample_EF}.
#' @export
#'
#' @examples
#' input_ex <- InputExample_EF(
#'   unique_id = 1,
#'   text_a = "I work at the bank."
#' )
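#'
#' # A two-segment (sentence-pair) example, using the optional text_b:
#' input_ex_pair <- InputExample_EF(
#'   unique_id = 2,
#'   text_a = "I work at the bank.",
#'   text_b = "The bank is near the river."
#' )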
InputExample_EF <- function(unique_id,
                            text_a,
                            text_b = NULL) {
  obj <- list(
    "unique_id" = unique_id,
    "text_a" = text_a,
    "text_b" = text_b
  )
  class(obj) <- "InputExample_EF"
  return(obj)
}
# class InputFeatures_EF --------------------------------------------------

#' Construct objects of class \code{InputFeatures_EF}
#'
#' An InputFeatures_EF object is a single set of (input) features of data used
#' for (output) feature extraction. Note that this class is similar to the
#' InputFeatures class used for simple sequence classification, with annoying
#' differences. Will eventually standardize; till then, check parameter names.
#' (RBERT issue #28.)
#'
#' @param unique_id Integer or character; a unique id for this example.
#' @param tokens Character vector; the actual tokens in this example.
#' @param input_ids Integer vector; the sequence of token ids in this example.
#' @param input_mask Integer vector; sequence of 1s (for "real" tokens) and 0s
#'   (for padding tokens).
#' @param input_type_ids Integer vector; aka token_type_ids. Indicators for
#'   which sentence (or sequence) each token belongs to. Classical BERT supports
#'   only 0s and 1s (for first and second sentence, respectively).
#'
#' @return An object of class \code{InputFeatures_EF}.
#' @keywords internal
.InputFeatures_EF <- function(unique_id,
                              tokens,
                              input_ids,
                              input_mask,
                              input_type_ids) {
  obj <- list(
    "unique_id" = unique_id,
    "tokens" = tokens,
    "input_ids" = input_ids,
    "input_mask" = input_mask,
    "input_type_ids" = input_type_ids
  )
  class(obj) <- "InputFeatures"
  return(obj)
}
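
# A minimal sketch (not run) of the structure .InputFeatures_EF returns, using
# hypothetical token ids and a short seq_length of 8 purely for illustration:
#   .InputFeatures_EF(
#     unique_id = 1L,
#     tokens = c("[CLS]", "i", "work", "at", "the", "bank", ".", "[SEP]"),
#     input_ids = c(101L, 1045L, 2147L, 2012L, 1996L, 2924L, 1012L, 102L),
#     input_mask = rep(1L, 8L),
#     input_type_ids = rep(0L, 8L)
#   )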

# input_fn_builder_EF -----------------------------------------------------


#' Create an \code{input_fn} closure to be passed to TPUEstimator
#'
#' Creates an \code{input_fn} closure to be passed to TPUEstimator. The output
#' of this closure is the (modified) output of
#' \code{tensorflow::tf$data$Dataset$from_tensor_slices} (an object of class
#' "tensorflow.python.data.ops.dataset_ops.BatchDataset"). This function is
#' similar to \code{input_fn_builder} from run_classifier.R. (RBERT issue #28.)
#'
#' @param features A list of features (objects of class
#'   \code{InputFeatures_EF}).
#' @param seq_length Integer; the maximum length (number of tokens) of each
#'   example. (Examples should already be padded to this length by this point.)
#'
#' @return An \code{input_fn} closure to be passed to TPUEstimator.
#' @keywords internal
input_fn_builder_EF <- function(features,
                                seq_length) {
  all_unique_ids <- purrr::map(
    features,
    function(f) {
      as.integer(f$unique_id)
    }
  )
  all_input_ids <- purrr::map(
    features,
    function(f) {
      as.integer(f$input_ids)
    }
  )
  all_input_mask <- purrr::map(
    features,
    function(f) {
      as.integer(f$input_mask)
    }
  )
  all_input_type_ids <- purrr::map(
    features,
    function(f) {
      as.integer(f$input_type_ids)
    }
  )

  input_fn <- function(params) {
    batch_size <- params$batch_size

    num_examples <- length(features)

    # "This is for demo purposes and does NOT scale to large data sets. We do
    #  not use Dataset.from_generator() because that uses tf.py_func which is
    #  not TPU compatible. The right way to load data is with TFRecordReader."

    d <- tensorflow::tf$data$Dataset$from_tensor_slices(
      # "A nested structure of tensors, each having the same size in the 0th
      # dimension." Try as a list. -JDB
      list(
        "unique_ids" = tensorflow::tf$constant(
          all_unique_ids,
          shape = tensorflow::shape(num_examples),
          dtype = tensorflow::tf$int32
        ),
        "input_ids" = tensorflow::tf$constant(
          all_input_ids,
          shape = tensorflow::shape(num_examples, seq_length),
          dtype = tensorflow::tf$int32
        ),
        "input_mask" = tensorflow::tf$constant(
          all_input_mask,
          shape = tensorflow::shape(num_examples, seq_length),
          dtype = tensorflow::tf$int32
        ),
        "input_type_ids" = tensorflow::tf$constant(
          all_input_type_ids,
          shape = tensorflow::shape(num_examples, seq_length),
          dtype = tensorflow::tf$int32
        )
      )
    )

    d <- d$batch(
      batch_size = batch_size,
      drop_remainder = FALSE
    )
    return(d) # return from `input_fn`
  }
  return(input_fn)
}
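
# A minimal usage sketch (not run): the estimator invokes the returned closure
# with a `params` list containing `batch_size`. Here `feats` stands in for a
# list of InputFeatures_EF objects produced by .convert_examples_to_features_EF.
#   input_fn <- input_fn_builder_EF(features = feats, seq_length = 128L)
#   dataset <- input_fn(params = list(batch_size = 2L))
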
# .model_fn_builder_EF -----------------------------------------------------

#' Define \code{model_fn} closure for \code{TPUEstimator}
#'
#' Returns \code{model_fn} closure, which is an input to \code{TPUEstimator}.
#' This function is similar to \code{model_fn_builder} from run_classifier.R.
#' (RBERT issue #28.)
#'
#' The \code{model_fn} function takes four parameters: \describe{
#' \item{features}{A list (or similar structure) that contains objects such as
#' \code{unique_ids}, \code{input_ids}, \code{input_mask}, and
#' \code{input_type_ids}. Most of these objects are inputs to the
#' \code{BertModel} call.} \item{labels}{Not used in this function, but
#' presumably we need to keep this slot here.} \item{mode}{Character; value such
#' as "train", "infer", or "eval".} \item{params}{Not used in this function, but
#' presumably we need to keep this slot here.} }
#'
#' The output of \code{model_fn} is the result of a
#' \code{tf$contrib$tpu$TPUEstimatorSpec} call.
#'
#' @param bert_config \code{BertConfig} instance.
#' @param init_checkpoint Character; path to the checkpoint directory, plus
#'   checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and
#'   explicit, starting with "/".
#' @param layer_indexes Integer list; indexes (positive, or negative counting
#'   back from the end) indicating which layers to extract as "output features".
#'   (It needs to be specified here because we get them back as the model
#'   "predictions".)
#' @param use_tpu Logical; whether to use TPU.
#'
#' @return \code{model_fn} closure for \code{TPUEstimator}.
#' @keywords internal
.model_fn_builder_EF <- function(bert_config,
                                 init_checkpoint,
                                 layer_indexes,
                                 use_tpu) {
  # The `model_fn` for TPUEstimator.
  model_fn <- function(features, labels, mode, params) {
    unique_ids <- features$unique_ids
    input_ids <- features$input_ids
    input_mask <- features$input_mask
    input_type_ids <- features$input_type_ids
    input_shape <- get_shape_list(input_ids, expected_rank = 2L)

    model <- BertModel(
      config = bert_config,
      is_training = FALSE,
      input_ids = input_ids,
      input_mask = input_mask,
      token_type_ids = input_type_ids
    )

    if (mode != tensorflow::tf$estimator$ModeKeys$PREDICT) {
      stop("Only PREDICT modes are supported.") # nocov
    }

    tvars <- tensorflow::tf$trainable_variables()
    initialized_variable_names <- list()
    scaffold_fn <- NULL

    gamap <- get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    assignment_map <- gamap$assignment_map
    initialized_variable_names <- gamap$initialized_variable_names
    if (use_tpu) { # nocov start
      tpu_scaffold <- function() {
        tensorflow::tf$train$init_from_checkpoint(
          init_checkpoint,
          assignment_map
        )
        return(tensorflow::tf$train$Scaffold())
      }
      scaffold_fn <- tpu_scaffold # nocov end
    } else {
      tensorflow::tf$train$init_from_checkpoint(
        init_checkpoint,
        assignment_map
      )
    }

    all_layers <- model$all_encoder_layers

    # ATTN: modified below to get attention_data from model
    attention_data <- model$attention_data
    # ATTN: modified above to get attention_data from model

    predictions <- list()
    predictions[["unique_id"]] <- unique_ids

    # Always include raw embeddings as the zeroth layer "output". We'll filter
    # them back out if we don't want them.
    predictions[["layer_output_0"]] <- model$embedding_output

    for (i in seq_along(layer_indexes)) {
      layer_index <- layer_indexes[[i]]
      # Accommodate both positive and negative indices.
      # Note that `all_layers` is 1-indexed!
      actual_index <- .get_actual_index(layer_index, length(all_layers))
      # For clarity, always use actual index to label outputs.
      key_str <- paste0("layer_output_", actual_index)
      predictions[[key_str]] <- all_layers[[actual_index]]

      # ATTN: modified below to include attention_data in "predictions"
      attn_key_str <- paste0("layer_attention_", actual_index)
      predictions[[attn_key_str]] <- attention_data[[actual_index]]
      # ATTN: modified above to include attention_data in "predictions"
    }
    output_spec <- tensorflow::tf$contrib$tpu$TPUEstimatorSpec(
      mode = mode,
      predictions = predictions,
      scaffold_fn = scaffold_fn
    )
    return(output_spec)
  } # end of `model_fn` definition
  return(model_fn)
}
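
# A minimal sketch (not run) of building a model_fn, assuming a 12-layer BERT
# config; `bert_config` and the checkpoint path are placeholders. Note that
# extract_features() resolves negative layer indexes before calling this, so
# `layer_indexes` here holds the actual (positive) indexes.
#   model_fn <- .model_fn_builder_EF(
#     bert_config = bert_config,
#     init_checkpoint = "/absolute/path/to/bert_model.ckpt",
#     layer_indexes = .get_actual_indexes(-4:-1, 12L),  # 9 10 11 12
#     use_tpu = FALSE
#   )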



# .convert_single_example_EF -----------------------------------------------

#' Convert a single \code{InputExample_EF} into a single \code{InputFeatures_EF}
#'
#' Converts a single \code{InputExample_EF} into a single
#' \code{InputFeatures_EF}. Very similar to \code{convert_single_example} from
#' run_classifier.R. (RBERT issue #28.)
#'
#' @param ex_index Integer; the index of this example. This is used to determine
#'   whether or not to print out some log info (for debugging or runtime
#'   confirmation). It is assumed this starts with 1 (in R).
#' @param example The \code{InputExample_EF} to convert.
#' @param seq_length Integer; the maximum number of tokens that will be
#'   considered together.
#' @param tokenizer A tokenizer object to use (e.g. object of class
#'   FullTokenizer).
#'
#' @return An object of class \code{InputFeatures_EF}.
#' @keywords internal
.convert_single_example_EF <- function(ex_index,
                                       example,
                                       seq_length,
                                       tokenizer) {

  # note use of S3 classes for dispatch, not methods.
  tokens_a <- tokenize(tokenizer, example$text_a)
  tokens_b <- NULL
  if (!is.null(example$text_b)) {
    tokens_b <- tokenize(tokenizer, example$text_b)
  }

  if (!is.null(tokens_b)) {
    # Truncate `tokens_a` and `tokens_b` so that the combined length fits
    # within the specified length. Account for [CLS], [SEP], [SEP] with "- 3".
    truncated_seq <- truncate_seq_pair(
      tokens_a, tokens_b,
      seq_length - 3
    )
    tokens_a <- truncated_seq$trunc_a
    tokens_b <- truncated_seq$trunc_b
  } else {
    # Account for [CLS] and [SEP] with "- 2"
    if (length(tokens_a) > seq_length - 2) {
      tokens_a <- tokens_a[1:(seq_length - 2)]
    }
  }

  # The convention in BERT is:
  # (a) For sequence pairs:
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
  # (b) For single sequences:
  #  tokens:   [CLS] the dog is hairy . [SEP]
  #  type_ids: 0     0   0   0  0     0 0
  #
  # Where "type_ids" are used to indicate whether this is the first
  # sequence or the second sequence. The embedding vectors for `type=0` and
  # `type=1` were learned during pre-training and are added to the wordpiece
  # embedding vector (and position vector). This is not *strictly* necessary
  # since the [SEP] token unambiguously separates the sequences, but it makes
  # it easier for the model to learn the concept of sequences.
  #
  # For classification tasks, the first vector (corresponding to [CLS]) is
  # used as the "sentence vector". Note that this only makes sense because
  # the entire model is fine-tuned.

  # The next few lines of code just insert the "[CLS]" and "[SEP]" tokens
  # in the appropriate places, and create the type_ids list.  -JDB
  cls_token <- "[CLS]"
  sep_token <- "[SEP]"
  tokens <- unlist(list(cls_token, tokens_a, sep_token))
  input_type_ids <- rep(0, length(tokens))

  if (!is.null(tokens_b)) {
    tokens2 <- unlist(list(tokens_b, sep_token))
    input_type_ids2 <- rep(1, length(tokens2))
    tokens <- c(tokens, tokens2)
    input_type_ids <- c(input_type_ids, input_type_ids2)
  }
  input_ids <- convert_tokens_to_ids(tokenizer$vocab, tokens)

  # The mask has 1 for real tokens and 0 for padding tokens. Only real
  # tokens are attended to.
  input_mask <- rep(1, length(input_ids))

  # Zero-pad up to the sequence length.
  pad_length <- seq_length - length(input_ids)
  padding <- rep(0, pad_length)
  input_ids <- c(input_ids, padding)
  input_mask <- c(input_mask, padding)
  input_type_ids <- c(input_type_ids, padding)

  # Stop now if the lengths aren't right somehow. -JDB
  if (length(input_ids) != seq_length ||
    length(input_mask) != seq_length ||
    length(input_type_ids) != seq_length) {
    stop("input_ids, input_mask, or input_type_ids are wrong length.") # nocov
  }

  feature <- .InputFeatures_EF(
    unique_id = example$unique_id,
    tokens = tokens,
    input_ids = input_ids,
    input_mask = input_mask,
    input_type_ids = input_type_ids
  )

  return(feature)
}
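
# A minimal sketch (not run), assuming `tokenizer` is a FullTokenizer built
# from the checkpoint vocabulary with do_lower_case = TRUE:
#   ex <- InputExample_EF(unique_id = 1, text_a = "I work at the bank.")
#   feat <- .convert_single_example_EF(
#     ex_index = 1L, example = ex, seq_length = 32L, tokenizer = tokenizer
#   )
#   feat$tokens             # "[CLS]" "i" "work" "at" "the" "bank" "." "[SEP]"
#   length(feat$input_ids)  # 32 (zero-padded up to seq_length)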

# .convert_examples_to_features_EF -----------------------------------------

#' Convert \code{InputExample_EF}s to \code{InputFeatures_EF}
#'
#' Converts a set of \code{InputExample_EF}s to a list of
#' \code{InputFeatures_EF}. Very similar to \code{convert_examples_to_features}
#' from run_classifier.R. (RBERT issue #28.)
#'
#' @param examples List of \code{InputExample_EF}s to convert.
#' @param seq_length Integer; the maximum number of tokens that will be
#'   considered together.
#' @param tokenizer A tokenizer object to use (e.g. object of class
#'   FullTokenizer).
#'
#' @return A list of \code{InputFeatures_EF} objects.
#' @keywords internal
.convert_examples_to_features_EF <- function(examples,
                                             seq_length,
                                             tokenizer) {
  # I have no idea why they had to rename the elements of examples/features
  # and then recreate these functions to handle the slightly different versions.
  # Whatever. We will clean up later. -JDB
  example_indices <- seq_along(examples)
  num_examples <- length(examples)
  features <- purrr::map2(
    example_indices,
    examples,
    function(ex_index, example,
             seq_length, tokenizer) {
      .convert_single_example_EF(
        ex_index = ex_index,
        example = example,
        seq_length = seq_length,
        tokenizer = tokenizer
      )
    },
    seq_length, tokenizer
  )
  return(features)
}



# extract_features --------------------------------------------------------

#' Extract output features from BERT
#'
#' Given example sentences (as a list of \code{InputExample_EF}s), apply an
#' existing BERT model and capture certain output layers. (These could
#' potentially be used as features in downstream tasks.)
#'
#' @param examples List of \code{InputExample_EF} objects, or character
#'   vector(s) that can be converted to \code{InputExample_EF} objects.
#' @param model Character; which model checkpoint to use. If specified,
#'   \code{ckpt_dir}, \code{vocab_file}, \code{bert_config_file}, and
#'   \code{init_checkpoint} will be inferred. If you do not have this
#'   checkpoint, you will be prompted to download it in interactive mode.
#' @param ckpt_dir Character; path to checkpoint directory. If specified, any
#'   other checkpoint files required by this function (\code{vocab_file},
#'   \code{bert_config_file}, or \code{init_checkpoint}) will default to
#'   standard filenames within \code{ckpt_dir}.
#' @param vocab_file Character; path to vocabulary file. File is assumed to be
#'   a text file, with one token per line, with the line number corresponding
#'   to the index of that token in the vocabulary.
#' @param bert_config_file Character; the path to a json config file.
#' @param init_checkpoint Character; path to the checkpoint directory, plus
#'   checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and
#'   explicit, starting with "/".
#' @param output_file (optional) Character; file path (stub) for writing output
#'   to.
#' @param max_seq_length Integer; the maximum number of tokens that will be
#'   considered together.
#' @param layer_indexes Integer vector; indexes (positive, or negative counting
#'   back from the end) indicating which layers to extract as "output features".
#'   The "zeroth" layer embeddings are the input embeddings vectors to the first
#'   layer.
#' @param batch_size Integer; how many examples to process per batch.
#' @param features Character; whether to return "output" (layer outputs, the
#'   default), "attention" (attention probabilities), or both.
#' @param verbose Logical; if FALSE, suppresses most of the TensorFlow chatter
#'   by temporarily setting the logging threshold to its highest level. If TRUE,
#'   keeps the current logging threshold, which defaults to "WARN". To change
#'   the logging threshold of the current session, run
#'   \code{tensorflow::tf$logging$set_verbosity(tensorflow::tf$logging$DEBUG)}
#'   (setting whatever verbosity level you want).
#'
#' @return A list with elements "output" (the layer outputs as a tibble) and/or
#'   "attention" (the attention weights as a tibble).
#' @export
#'
#' @examples
#' \dontrun{
#' BERT_PRETRAINED_DIR <- download_BERT_checkpoint("bert_base_uncased")
#' examples <- c("I saw the branch on the bank.",
#'               "I saw the branch of the bank.")
#'
#' # Just specify checkpoint directory.
#' feats <- extract_features(
#'   examples = examples,
#'   ckpt_dir = BERT_PRETRAINED_DIR
#' )
#' # Can also just specify the model, if you have it downloaded.
#' # In interactive mode, you'll be prompted to download the model if you do not
#' # have it.
#' feats <- extract_features(
#'   examples = examples,
#'   model = "bert_base_uncased"
#' )
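#'
#' # Optionally request attention weights and/or specific layers. Layer 0
#' # denotes the input embeddings; negative indexes count back from the last
#' # layer. (The particular layers chosen here are just for illustration.)
#' feats_attn <- extract_features(
#'   examples = examples,
#'   ckpt_dir = BERT_PRETRAINED_DIR,
#'   layer_indexes = c(0, -1),
#'   features = c("output", "attention")
#' )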
#' }
extract_features <- function(examples,
                             model = c(
                               "bert_base_uncased",
                               "bert_base_cased",
                               "bert_large_uncased",
                               "bert_large_cased",
                               "bert_large_uncased_wwm",
                               "bert_large_cased_wwm",
                               "bert_base_multilingual_cased",
                               "bert_base_chinese",
                               "scibert_scivocab_uncased",
                               "scibert_scivocab_cased",
                               "scibert_basevocab_uncased",
                               "scibert_basevocab_cased"
                             ),
                             ckpt_dir = NULL,
                             vocab_file = find_vocab(ckpt_dir),
                             bert_config_file = find_config(ckpt_dir),
                             init_checkpoint = find_ckpt(ckpt_dir),
                             output_file = NULL,
                             max_seq_length = 128L,
                             layer_indexes = -4:-1,
                             batch_size = 2L,
                             features = c(
                               "output",
                               "attention"
                             ),
                             verbose = FALSE) {
  if (is.character(examples)) {
    examples <- make_examples_simple(seq_list = examples)
  } else if (inherits(examples, "InputExample_EF")) {
    # As a courtesy, we allow a single InputExample_EF object to be passed in.
    examples <- list(examples)
  } else {
    is_correct_class <- purrr::map_lgl(
      examples,
      function(e) {
        inherits(e, "InputExample_EF")
      }
    )
    if (!all(is_correct_class)) {
      # Assume they can be converted.
      # Let `make_examples_simple` handle any messaging.
      examples <- make_examples_simple(seq_list = examples)
    }
  }

  model_paths <- .infer_model_paths(
    model, ckpt_dir, vocab_file, bert_config_file, init_checkpoint
  )

  vocab_file <- model_paths$vocab_file
  bert_config_file <- model_paths$bert_config_file
  init_checkpoint <- model_paths$init_checkpoint

  if (missing(features)) {
    features <- "output"
  }
  features <- match.arg(features, several.ok = TRUE)

  include_zeroth <- FALSE
  if (0 %in% layer_indexes) {
    include_zeroth <- TRUE
    layer_indexes <- layer_indexes[layer_indexes != 0]
  }

  old_verbosity_level <- tensorflow::tf$logging$get_verbosity()
  old_sys_log_level <- Sys.getenv("TF_CPP_MIN_LOG_LEVEL")
  if (!verbose) {
    tensorflow::tf$logging$set_verbosity(tensorflow::tf$logging$FATAL)
    Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3")
  }

  bert_config <- bert_config_from_json_file(bert_config_file)
  n_layers <- bert_config$num_hidden_layers
  layer_indexes_actual <- .get_actual_indexes(layer_indexes, n_layers)
  n_dimensions <- bert_config$hidden_size
  tokenizer <- FullTokenizer(
    vocab_file = vocab_file,
    do_lower_case = TRUE
  )
  is_per_host <- tensorflow::tf$contrib$tpu$InputPipelineConfig$PER_HOST_V2

  run_config <- tensorflow::tf$contrib$tpu$RunConfig(
    master = NULL, # assume for now *not* for TPU
    tpu_config = tensorflow::tf$contrib$tpu$TPUConfig(
      num_shards = 8L,
      per_host_input_for_training = is_per_host
    )
  )

  raw_features <- .convert_examples_to_features_EF(
    examples = examples,
    seq_length = max_seq_length,
    tokenizer = tokenizer
  )
  unique_id_to_feature <- list()
  for (feature in raw_features) {
    unique_id_to_feature[[feature$unique_id]] <- feature
  }

  model_fn <- .model_fn_builder_EF(
    bert_config = bert_config,
    init_checkpoint = init_checkpoint,
    layer_indexes = layer_indexes_actual,
    use_tpu = FALSE
  )

  estimator <- tensorflow::tf$contrib$tpu$TPUEstimator(
    use_tpu = FALSE, # no tpu support for now
    model_fn = reticulate::py_func(model_fn),
    config = run_config,
    predict_batch_size = batch_size
  )

  input_fn <- input_fn_builder_EF(
    features = raw_features,
    seq_length = max_seq_length
  )

  result_iterator <- estimator$predict(reticulate::py_func(input_fn),
    yield_single_examples = TRUE
  )


  # Set up the needed lists. They'll be filled in by the for loop below.
  big_output <- NULL
  big_attention <- NULL
  wants_output <- "output" %in% features
  wants_attention <- "attention" %in% features
  if (wants_output) {
    big_output <- tibble::tibble(
      sequence_index = integer(),
      segment_index = integer(),
      token_index = integer(),
      token = character(),
      layer_index = integer()
    )
    for (colname in paste0("V", seq_len(n_dimensions))) {
      big_output[[colname]] <- integer()
    }
    layer_indexes_output <- layer_indexes_actual
    if (include_zeroth) {
      layer_indexes_output <- c(0L, layer_indexes_output)
    }
  }
  if (wants_attention) {
    big_attention <- tibble::tibble(
      sequence_index = integer(),
      token_index = integer(),
      segment_index = integer(),
      token = character(),
      layer_index = integer(),
      head_index = integer(),
      attention_token_index = integer(),
      attention_segment_index = integer(),
      attention_token = character(),
      attention_weight = double()
    )
  }
  token_map <- tibble::tibble(
    sequence_index = integer(),
    token_index = integer(),
    segment_index = integer(),
    token = character()
  )

  # For speed testing.
  # wants_output <- FALSE
  # wants_attention <- FALSE

  # result_iterator should have an entry available for each sequence.
  for (i in seq_along(examples)) {
    result <- if ("next" %in% names(result_iterator)) {
      result_iterator$`next`() # nocov
    } else {
      result_iterator$`__next__`() # nocov
    }

    tokens <- unique_id_to_feature[[result$unique_id]]$tokens
    token_seq <- seq_along(tokens)
    token_map <- dplyr::bind_rows(
      token_map,
      data.frame(
        sequence_index = as.integer(result$unique_id),
        token_index = token_seq,
        token = tokens,
        stringsAsFactors = FALSE
      )
    )

    if (wants_output) {
      big_output <- .process_output_result(
        big_output,
        result,
        layer_indexes_output,
        token_seq
      )
    }

    if (wants_attention) {
      big_attention <- .process_attention_result(
        big_attention,
        result,
        layer_indexes_actual,
        token_seq
      )
    }
  }

  # restore original verbosity levels
  tensorflow::tf$logging$set_verbosity(old_verbosity_level)
  Sys.setenv(TF_CPP_MIN_LOG_LEVEL = old_sys_log_level)

  # Iterate one more time to let python finish and be happy.
  result <- tryCatch(
    {
      if ("next" %in% names(result_iterator)) {
        result_iterator$`next`() # nocov
      } else {
        result_iterator$`__next__`() # nocov
      }
    },
    error = function(e) {
      FALSE
      # If we get error, `result` will be assigned this FALSE.
      # The only way to tell we've reached the end is to get an error. :-/
    }
  )
  if (!identical(result, FALSE)) {
    stop("More results returned than sequences.") # nocov
  }

  # Clean up features.
  token_map <- .infer_segment_index(token_map, sequence_index)
  if (wants_output) {
    big_output <- .finalize_output(big_output, token_map)
  }
  if (wants_attention) {
    big_attention <- .finalize_attention(big_attention, token_map)
  }

  # Assign the elements one at a time so that NULL values are dropped from the
  # list entirely, rather than appearing as explicit NULL entries as they would
  # if the list were built in a single step.
  to_return <- list()
  to_return$output <- big_output
  to_return$attention <- big_attention
  return(to_return)
}


# .process_output_result --------------------------------------------------

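# Appends one example's layer outputs to the running `big_output` tibble: one
# row per (requested layer, token), with the embedding values spread across
# columns V1, V2, ... by as.data.frame().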
.process_output_result <- function(big_output,
                                   result,
                                   layer_indexes_output,
                                   token_seq) {
  sequence_index <- as.integer(result$unique_id)
  result_output_names <- paste0("layer_output_", layer_indexes_output)
  sequence_outputs <- purrr::imap_dfr(
    result_output_names,
    function(this_layer, this_index) {
      this_output <- as.data.frame(
        result[[this_layer]][token_seq, ]
      )
      this_output[["layer_index"]] <- layer_indexes_output[[this_index]]
      this_output
    }
  )
  sequence_outputs[["token_index"]] <- rep(
    token_seq, length(result_output_names)
  )
  sequence_outputs[["sequence_index"]] <- sequence_index
  return(
    dplyr::bind_rows(
      big_output,
      sequence_outputs
    )
  )
}

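# Infers a segment_index column for each token by counting "[SEP]" tokens
# within each group defined by `...` (typically sequence_index): tokens up to
# and including the first [SEP] get segment 1, tokens after it get 2, etc.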
.infer_segment_index <- function(feature_tibble, ...) {
  return(
    dplyr::ungroup(
      dplyr::select(
        dplyr::mutate(
          dplyr::group_by(
            dplyr::mutate(
              feature_tibble,
              is_sep = token == "[SEP]"
            ),
            ...
          ),
          segment_index = cumsum(is_sep) - is_sep + 1L
        ),
        -is_sep
      )
    )
  )
}
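
# A minimal illustration (not run) with a single two-segment sequence:
#   tm <- tibble::tibble(
#     sequence_index = 1L,
#     token_index = 1:5,
#     token = c("[CLS]", "good", "[SEP]", "bad", "[SEP]")
#   )
#   .infer_segment_index(tm, sequence_index)$segment_index  # 1 1 1 2 2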

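# Fills in the token text and inferred segment_index for each row of
# `big_output` by joining against `token_map` on sequence and token index.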
.finalize_output <- function(big_output, token_map) {
  return(
    dplyr::select(
      dplyr::mutate(
        dplyr::left_join(
          big_output, token_map,
          by = c("sequence_index", "token_index"),
          suffix = c("", "_fill")
        ),
        segment_index = segment_index_fill,
        token = token_fill
      ),
      -dplyr::ends_with("_fill")
    )
  )
}


# .process_attention_result -----------------------------------------------

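# Appends one example's attention weights to the running `big_attention`
# tibble: one row per (layer, head, token, attended-to token), obtained by
# unnesting the head x token x token attention array.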
.process_attention_result <- function(big_attention,
                                      result,
                                      layer_indexes_actual,
                                      token_seq) {
  result_attention_names <- paste0("layer_attention_", layer_indexes_actual)

  sequence_attention <- purrr::imap_dfr(
    result_attention_names,
    function(this_layer, this_index) {
      this_attention <- tidyr::unnest_longer(
        tidyr::unnest_longer(
          tibble::enframe(
            purrr::array_tree(
              result[[this_layer]][, token_seq, token_seq]
            ),
            name = "head_index"
          ),
          value,
          indices_to = "token_index"
        ),
        value,
        indices_to = "attention_token_index",
        values_to = "attention_weight"
      )
      this_attention$layer_index <- layer_indexes_actual[[this_index]]
      this_attention
    }
  )
  sequence_attention$sequence_index <- as.integer(result$unique_id)
  big_attention <- dplyr::bind_rows(
    big_attention,
    sequence_attention
  )

  return(big_attention)
}

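# Fills in token text and segment indexes for both sides of each attention
# pair: first for the attending token (token_index), then for the attended-to
# token (attention_token_index), by joining against `token_map` twice.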
.finalize_attention <- function(big_attention,
                                token_map) {
  return(
    dplyr::select(
      dplyr::mutate(
        dplyr::left_join(
          dplyr::select(
            dplyr::mutate(
              dplyr::left_join(
                big_attention, token_map,
                by = c("sequence_index", "token_index"),
                suffix = c("", "_fill")
              ),
              segment_index = segment_index_fill,
              token = token_fill
            ),
            -dplyr::ends_with("_fill")
          ),
          token_map,
          by = c("sequence_index", "attention_token_index" = "token_index"),
          suffix = c("", "_fill")
        ),
        attention_segment_index = segment_index_fill,
        attention_token = token_fill
      ),
      -dplyr::ends_with("_fill")
    )
  )
}


# .get_actual_index ---------------------------------------------------

#' Standardize Indices
#'
#' Convert negative indices to positive ones. Use the convention that
#' \code{vec[[-1L]]} signifies the last element of \code{vec}, \code{vec[[-2L]]}
#' signifies the second-to-last element of \code{vec}, and so on. 1-based
#' indexing is assumed. Values of zero, or out-of-range indices, will be
#' rejected.
#'
#' @param index Integer; the index to normalize.
#' @param length Integer; the length of the vector or list we are indexing.
#'
#' @return The "actual" integer index, between 1 and \code{length}, inclusive.
#' @keywords internal
.get_actual_index <- function(index,
                              length) {
  index <- as.integer(index)
  if (abs(index) > length) {
    stop(paste(
      "Index out of range.",
      "Absolute value of index must be within specified length."
    ))
  } else if (index == 0) {
    stop(paste(
      "Ambiguous index.",
      "Only strictly positive or negative indices accepted."
    ))
  } else if (index < 0) {
    return(as.integer((length + index) %% length + 1L))
  } else {
    return(index)
  }
}

.get_actual_indexes <- function(indexes, object_length) {
  return(
    purrr::map_int(indexes, .get_actual_index, object_length)
  )
}
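
# A few illustrative calls (not run), assuming a 12-layer model:
#   .get_actual_index(-1L, 12L)      # 12 (the last layer)
#   .get_actual_index(3L, 12L)       # 3
#   .get_actual_indexes(-4:-1, 12L)  # 9 10 11 12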

# make_examples_simple ----------------------------------------------------

#' Easily make examples for BERT
#'
#' A simple wrapper function to turn a list of text (as a character
#' vector or list) into a list of examples suitable for use with RBERT. If the
#' input is a flat list or vector of characters, the examples will be
#' single-segment, with NULL for the second segment. If the input contains
#' length-2 sublists or vectors, those examples will be two-segment sequences,
#' e.g. for doing sentence-pair classification.
#'
#' @param seq_list Character vector or list; text to turn into examples.
#'
#' @return A list of \code{InputExample_EF} objects.
#' @export
#'
#' @examples
#' input_ex <- make_examples_simple(c(
#'   "Here are some words.",
#'   "Here are some more words."
#' ))
#' input_ex2 <- make_examples_simple(list(
#'   c(
#'     "First sequence, first segment.",
#'     "First sequence, second segment."
#'   ),
#'   c(
#'     "Second sequence, first segment.",
#'     "Second sequence, second segment."
#'   )
#' ))
make_examples_simple <- function(seq_list) {
  if (any(purrr::map_int(seq_list, length) > 2)) {
    warning(
      "Examples must contain at most two distinct segments. ",
      "Segments beyond the second will be ignored."
    )
  }
  seq_nums <- seq_along(seq_list)
  purrr::map(seq_nums, function(sn) {
    first_segment <- seq_list[[sn]][[1]]
    second_segment <- NULL
    if (length(seq_list[[sn]]) > 1) {
      second_segment <- seq_list[[sn]][[2]]
    }
    InputExample_EF(
      unique_id = sn,
      text_a = first_segment,
      text_b = second_segment
    )
  })
}