R/keras_helpers.R

#'
#' Constants for keras model training
#'
keras_config_params <- list(
  "maxlen" = 30L,
  "max_words" = 10000L,
  "embedding_dim" = 100L
)

#' Get Tokenizer for tokenizing train/test data
#'
#' @name get_tokenizer
#'
#' @description Fit a keras text tokenizer on the \code{text} column of the
#' supplied data, keeping only the most frequent words.
#'
#' @param data Data frame with a \code{text} column to fit the tokenizer on.
#' @param max_words Maximum number of words to keep, ranked by frequency.
#' @return tokenizer object
#'
#' @importFrom magrittr %>%
#' @importFrom keras text_tokenizer fit_text_tokenizer
#'
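#' @examples
#' \dontrun{
#' # Illustrative sketch: `reviews` is a hypothetical data frame with a
#' # character `text` column. texts_to_sequences() is the standard keras
#' # call for converting fitted text into integer index sequences.
#' tokenizer <- get_tokenizer(reviews, max_words = keras_config_params$max_words)
#' sequences <- keras::texts_to_sequences(tokenizer, reviews$text)
#' }
#'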
get_tokenizer <- function(data, max_words) {
  tokenizer <- keras::text_tokenizer(num_words = max_words) %>%
    keras::fit_text_tokenizer(data$text)
  return(tokenizer)
}

#' Create LSTM model object
#'
#' @name create_lstm_model
#'
#' @description Build a sequential keras model consisting of an embedding
#' layer, a single LSTM layer, and a sigmoid output unit for binary
#' classification.
#'
#' @param max_words Maximum number of words to keep, ranked by frequency.
#' @param embedding_dim Output dimension of the embedding layer.
#' @param maxlen Maximum length of a sequence.
#' @param lstm_units Number of units, i.e. the output dimension, of the LSTM layer.
#' @return LSTM model object
#'
#' @importFrom magrittr %>%
#' @importFrom keras keras_model_sequential layer_embedding layer_lstm layer_dense
#'
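#' @examples
#' \dontrun{
#' # Sketch using the package's config constants; lstm_units = 32 is an
#' # illustrative choice, not a package default. compile() uses standard
#' # keras arguments for binary classification.
#' model <- create_lstm_model(max_words = keras_config_params$max_words,
#'                            embedding_dim = keras_config_params$embedding_dim,
#'                            maxlen = keras_config_params$maxlen,
#'                            lstm_units = 32)
#' model %>% keras::compile(optimizer = "rmsprop",
#'                          loss = "binary_crossentropy",
#'                          metrics = c("acc"))
#' }
#'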
create_lstm_model <- function(max_words, embedding_dim, maxlen, lstm_units) {
  model <- keras::keras_model_sequential() %>%
    keras::layer_embedding(input_dim = max_words,
                           output_dim = embedding_dim,
                           input_length = maxlen) %>%
    keras::layer_lstm(units = lstm_units) %>%
    keras::layer_dense(units = 1,
                       activation = "sigmoid")
  # Print the model architecture as a side effect
  summary(model)

  return(model)
}

#' Create 1-Dimensional Convolutional Network model object
#'
#' @name create_conv1d_model
#'
#' @description Build a sequential keras model consisting of an embedding
#' layer, two 1-D convolution layers with max pooling in between, global max
#' pooling, and a sigmoid output unit for binary classification.
#'
#' @param max_words Maximum number of words to keep, ranked by frequency.
#' @param embedding_dim Output dimension of the embedding layer.
#' @param maxlen Maximum length of a sequence.
#' @param conv1d_filters Number of filters, i.e. the output dimension, of the convolution layers.
#' @param conv1d_kernel_size Window size for the convolution layers.
#' @param conv1d_pool_size Pool size for max pooling.
#' @return 1-Dimensional Convolutional Network model object
#'
#' @importFrom magrittr %>%
#' @importFrom keras keras_model_sequential layer_embedding layer_conv_1d layer_max_pooling_1d layer_global_max_pooling_1d layer_dense
#'
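#' @examples
#' \dontrun{
#' # Illustrative values: the filter count, kernel size, and pool size below
#' # are assumptions, chosen so the layer output lengths stay positive at
#' # maxlen = 30.
#' model <- create_conv1d_model(max_words = keras_config_params$max_words,
#'                              embedding_dim = keras_config_params$embedding_dim,
#'                              maxlen = keras_config_params$maxlen,
#'                              conv1d_filters = 32,
#'                              conv1d_kernel_size = 3,
#'                              conv1d_pool_size = 2)
#' }
#'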
create_conv1d_model <- function(max_words, embedding_dim, maxlen, conv1d_filters, conv1d_kernel_size, conv1d_pool_size) {
  model <- keras::keras_model_sequential() %>%
    keras::layer_embedding(input_dim = max_words,
                           output_dim = embedding_dim,
                           input_length = maxlen) %>%
    keras::layer_conv_1d(filters = conv1d_filters,
                         kernel_size = conv1d_kernel_size,
                         activation = "relu") %>%
    keras::layer_max_pooling_1d(pool_size = conv1d_pool_size) %>%
    keras::layer_conv_1d(filters = conv1d_filters,
                         kernel_size = conv1d_kernel_size,
                         activation = "relu") %>%
    keras::layer_global_max_pooling_1d() %>%
    keras::layer_dense(units = 1, activation = "sigmoid")

  # Print the model architecture as a side effect
  summary(model)

  return(model)
}

#' Generate training data
#'
#' @name generate_training_data
#'
#' @description Pad the tokenized sequences to a common length, pair them with
#' the polarity labels, and shuffle the result.
#'
#' @param data Data frame containing a \code{polarity} label column.
#' @param sequences Integer sequences produced by applying the tokenizer to the text.
#' @param maxlen Maximum length of a sequence.
#' @param seed Seed for shuffling the data.
#' @return List object with \code{x_train} and \code{y_train} objects.
#'
#' @importFrom keras pad_sequences
#'
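#' @examples
#' \dontrun{
#' # Sketch: `reviews` (with a 0/1 `polarity` column) and `sequences` are
#' # assumed to come from get_tokenizer()/texts_to_sequences() as above.
#' train <- generate_training_data(reviews, sequences,
#'                                 maxlen = keras_config_params$maxlen,
#'                                 seed = 42)
#' dim(train$x_train)
#' }
#'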
generate_training_data <- function(data, sequences, maxlen, seed) {
  # Pad sequences to a common length
  padded_data <- keras::pad_sequences(sequences, maxlen = maxlen)
  labels <- as.array(data$polarity)
  cat("Shape of data tensor:", dim(padded_data), "\n")
  cat("Shape of label tensor:", dim(labels), "\n")

  # Shuffle the rows with a reproducible random permutation
  set.seed(seed)
  indices <- sample(nrow(padded_data))
  x_train <- padded_data[indices, ]
  y_train <- labels[indices]

  result <- list(
    x_train = x_train,
    y_train = y_train
  )

  return(result)
}

#' Parse GloVe embeddings
#'
#' @name parse_glove_embeddings
#'
#' @description Read a downloaded GloVe embeddings file and build an index
#' mapping each word to its embedding vector.
#'
#' @param file_path File path of the downloaded GloVe embeddings.
#' @return Parsed embeddings index object.
#'
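#' @examples
#' \dontrun{
#' # The file path is illustrative; glove.6B.100d.txt matches
#' # embedding_dim = 100.
#' embeddings_index <- parse_glove_embeddings("data/glove.6B.100d.txt")
#' }
#'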
parse_glove_embeddings <- function(file_path) {
  lines <- readLines(file_path)
  # Use an environment as a hash map from word to embedding vector
  embeddings_index <- new.env(hash = TRUE, parent = emptyenv())
  for (i in seq_along(lines)) {
    # Each line holds a word followed by its space-separated vector components
    values <- strsplit(lines[[i]], " ")[[1]]
    word <- values[[1]]
    embeddings_index[[word]] <- as.double(values[-1])
  }
  cat("Found", length(embeddings_index), "word vectors.\n")
  return(embeddings_index)
}

#' Generate Embedding Matrix for given word index
#'
#' @name generate_embedding_matrix
#'
#' @description Build a \code{max_words} by \code{embedding_dim} matrix whose
#' rows hold the pretrained GloVe vectors for the words in the word index;
#' words without a pretrained vector are left as all-zero rows.
#'
#' @param word_index Word index as generated by the tokenizer.
#' @param embedding_dim Output dimension of the embedding layer.
#' @param max_words Maximum number of words to keep, ranked by frequency.
#' @param glove_file_path File path of the downloaded GloVe embeddings.
#' @return Embedding Matrix object
#'
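#' @examples
#' \dontrun{
#' # Sketch of typical downstream use: load the matrix into the first
#' # (embedding) layer of a model and freeze it. The file path is
#' # illustrative; get_layer(), set_weights(), and freeze_weights() are
#' # standard keras functions.
#' embedding_matrix <- generate_embedding_matrix(
#'   tokenizer$word_index,
#'   embedding_dim = keras_config_params$embedding_dim,
#'   max_words = keras_config_params$max_words,
#'   glove_file_path = "data/glove.6B.100d.txt"
#' )
#' keras::get_layer(model, index = 1) %>%
#'   keras::set_weights(list(embedding_matrix)) %>%
#'   keras::freeze_weights()
#' }
#'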
generate_embedding_matrix <- function(word_index, embedding_dim, max_words, glove_file_path) {
  embeddings_index <- parse_glove_embeddings(glove_file_path)

  embedding_matrix <- array(0, c(max_words, embedding_dim))
  for (word in names(word_index)) {
    index <- word_index[[word]]
    # Only words with index < max_words fit in the embedding layer's input
    # range; matrix row index + 1 is used because index 0 is reserved for
    # padding and R matrices are 1-based.
    if (index < max_words) {
      embedding_vector <- embeddings_index[[word]]
      if (!is.null(embedding_vector)) {
        embedding_matrix[index + 1, ] <- embedding_vector
      }
    }
  }

  return(embedding_matrix)
}