#'
#' Constants for keras model training
#'
keras_config_params <- list(
  maxlen = 30L,
  max_words = 10000L,
  embedding_dim = 100L
)
#' Get Tokenizer for tokenizing train/test data
#'
#' @name get_tokenizer
#'
#' @description Get Tokenizer for tokenizing train/test data
#'
#' @param data Data to fit the tokenizer on; expected to contain a \code{text} column.
#' @param max_words Maximum number of words to keep, ranked by word frequency.
#' @return A fitted tokenizer object.
#'
#' @importFrom magrittr %>%
#' @importFrom keras text_tokenizer fit_text_tokenizer
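#'
#' @examples
#' \dontrun{
#' # A minimal sketch; `reviews` is a hypothetical data frame with a `text` column.
#' tokenizer <- get_tokenizer(reviews, max_words = keras_config_params$max_words)
#' sequences <- keras::texts_to_sequences(tokenizer, reviews$text)
#' }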
#'
get_tokenizer <- function(data, max_words) {
  tokenizer <- keras::text_tokenizer(num_words = max_words) %>%
    keras::fit_text_tokenizer(data$text)
  return(tokenizer)
}
#' Create LSTM model object
#'
#' @name create_lstm_model
#'
#' @description Create LSTM model object
#'
#' @param max_words Maximum number of words to keep, ranked by word frequency.
#' @param embedding_dim Output dimension of the embedding layer.
#' @param maxlen Maximum length of a sequence.
#' @param lstm_units Number of units, i.e. the output dimension of the LSTM layer.
#' @return LSTM model object
#'
#' @importFrom magrittr %>%
#' @importFrom keras keras_model_sequential layer_embedding layer_lstm layer_dense
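#'
#' @examples
#' \dontrun{
#' # A minimal sketch using the constants above; lstm_units = 32 and the
#' # compile settings are illustrative choices, not package defaults.
#' model <- create_lstm_model(
#'   max_words = keras_config_params$max_words,
#'   embedding_dim = keras_config_params$embedding_dim,
#'   maxlen = keras_config_params$maxlen,
#'   lstm_units = 32L
#' )
#' model %>% keras::compile(
#'   optimizer = "rmsprop",
#'   loss = "binary_crossentropy",
#'   metrics = c("acc")
#' )
#' }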
#'
create_lstm_model <- function(max_words, embedding_dim, maxlen, lstm_units) {
  model <- keras::keras_model_sequential() %>%
    keras::layer_embedding(input_dim = max_words,
                           output_dim = embedding_dim,
                           input_length = maxlen) %>%
    keras::layer_lstm(units = lstm_units) %>%
    keras::layer_dense(units = 1,
                       activation = "sigmoid")
  # Print the architecture summary as a side effect
  summary(model)
  return(model)
}
#' Create 1-Dimensional Convolutional Network model object
#'
#' @name create_conv1d_model
#'
#' @description Create 1-Dimensional Convolutional Network model object
#'
#' @param max_words Maximum number of words to keep, ranked by word frequency.
#' @param embedding_dim Output dimension of the embedding layer.
#' @param maxlen Maximum length of a sequence.
#' @param conv1d_filters Number of filters, i.e. the output dimension of each convolution layer.
#' @param conv1d_kernel_size Window size for the convolution layers.
#' @param conv1d_pool_size Pool size for max pooling.
#' @return 1-Dimensional Convolutional Network model object
#'
#' @importFrom magrittr %>%
#' @importFrom keras keras_model_sequential layer_embedding layer_conv_1d layer_max_pooling_1d layer_global_max_pooling_1d layer_dense
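#'
#' @examples
#' \dontrun{
#' # A minimal sketch; the filter, kernel, and pool sizes are illustrative.
#' model <- create_conv1d_model(
#'   max_words = keras_config_params$max_words,
#'   embedding_dim = keras_config_params$embedding_dim,
#'   maxlen = keras_config_params$maxlen,
#'   conv1d_filters = 32L,
#'   conv1d_kernel_size = 7L,
#'   conv1d_pool_size = 5L
#' )
#' }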
#'
create_conv1d_model <- function(max_words, embedding_dim, maxlen, conv1d_filters, conv1d_kernel_size, conv1d_pool_size) {
  model <- keras::keras_model_sequential() %>%
    keras::layer_embedding(input_dim = max_words,
                           output_dim = embedding_dim,
                           input_length = maxlen) %>%
    keras::layer_conv_1d(filters = conv1d_filters,
                         kernel_size = conv1d_kernel_size,
                         activation = "relu") %>%
    keras::layer_max_pooling_1d(pool_size = conv1d_pool_size) %>%
    keras::layer_conv_1d(filters = conv1d_filters,
                         kernel_size = conv1d_kernel_size,
                         activation = "relu") %>%
    keras::layer_global_max_pooling_1d() %>%
    keras::layer_dense(units = 1, activation = "sigmoid")
  # Print the architecture summary as a side effect
  summary(model)
  return(model)
}
#' Generate training data
#'
#' @name generate_training_data
#'
#' @description Generate training data
#'
#' @param data Input data; expected to contain a \code{polarity} label column.
#' @param sequences Sequences generated by applying the tokenizer to the text.
#' @param maxlen Maximum length of a sequence.
#' @param seed Seed for shuffling the data.
#' @return List object with \code{x_train} and \code{y_train} objects.
#'
#' @importFrom keras pad_sequences
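#'
#' @examples
#' \dontrun{
#' # A hedged end-to-end sketch; `reviews` is a hypothetical data frame with
#' # `text` and `polarity` columns.
#' tokenizer <- get_tokenizer(reviews, keras_config_params$max_words)
#' sequences <- keras::texts_to_sequences(tokenizer, reviews$text)
#' train <- generate_training_data(reviews, sequences,
#'                                 maxlen = keras_config_params$maxlen,
#'                                 seed = 42L)
#' }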
#'
generate_training_data <- function(data, sequences, maxlen, seed) {
  # Pad sequences to a common length
  padded_data <- keras::pad_sequences(sequences, maxlen = maxlen)
  labels <- as.array(data$polarity)
  cat("Shape of data tensor:", dim(padded_data), "\n")
  cat("Shape of label tensor:", dim(labels), "\n")
  # Shuffle the data
  set.seed(seed)
  indices <- sample(nrow(padded_data))
  x_train <- padded_data[indices, ]
  y_train <- labels[indices]
  result <- list(
    x_train = x_train,
    y_train = y_train
  )
  return(result)
}
#' Parse GloVe embeddings
#'
#' @name parse_glove_embeddings
#'
#' @description Parse GloVe embeddings
#'
#' @param file_path File path of the downloaded GloVe embeddings.
#' @return Parsed embeddings index object.
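#'
#' @examples
#' \dontrun{
#' # A minimal sketch; the file name assumes the 100-dimensional GloVe 6B set
#' # has been downloaded to the working directory.
#' embeddings_index <- parse_glove_embeddings("glove.6B.100d.txt")
#' }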
#'
parse_glove_embeddings <- function(file_path) {
  lines <- readLines(file_path)
  embeddings_index <- new.env(hash = TRUE, parent = emptyenv())
  for (i in seq_along(lines)) {
    # Each line is a word followed by its space-separated vector components
    line <- lines[[i]]
    values <- strsplit(line, " ")[[1]]
    word <- values[[1]]
    embeddings_index[[word]] <- as.double(values[-1])
  }
  cat("Found", length(embeddings_index), "word vectors.\n")
  return(embeddings_index)
}
#' Generate Embedding Matrix for given word index
#'
#' @name generate_embedding_matrix
#'
#' @description Generate Embedding Matrix for given word index
#'
#' @param word_index Word index as generated by the tokenizer.
#' @param embedding_dim Output dimension of the embedding layer.
#' @param max_words Maximum number of words to keep, ranked by word frequency.
#' @param glove_file_path File path of the GloVe embeddings.
#' @return Embedding matrix object.
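#'
#' @examples
#' \dontrun{
#' # A hedged sketch of the usual workflow: build the matrix from the fitted
#' # tokenizer's word index, load it into the first (embedding) layer of a
#' # model built above, and freeze it so training does not overwrite it.
#' embedding_matrix <- generate_embedding_matrix(
#'   word_index = tokenizer$word_index,
#'   embedding_dim = keras_config_params$embedding_dim,
#'   max_words = keras_config_params$max_words,
#'   glove_file_path = "glove.6B.100d.txt"
#' )
#' keras::get_layer(model, index = 1) %>%
#'   keras::set_weights(list(embedding_matrix)) %>%
#'   keras::freeze_weights()
#' }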
#'
generate_embedding_matrix <- function(word_index, embedding_dim, max_words, glove_file_path) {
  embeddings_index <- parse_glove_embeddings(glove_file_path)
  embedding_matrix <- array(0, c(max_words, embedding_dim))
  for (word in names(word_index)) {
    index <- word_index[[word]]
    if (index < max_words) {
      embedding_vector <- embeddings_index[[word]]
      if (!is.null(embedding_vector)) {
        # Tokenizer indices start at 1, and index 0 is reserved for padding,
        # so word i maps to matrix row i + 1. Words without a GloVe vector
        # keep their all-zero rows.
        embedding_matrix[index + 1, ] <- embedding_vector
      }
    }
  }
  return(embedding_matrix)
}