Nothing
#' Generates n-gram models from a text file
#'
#' @description
#' It provides a method for generating n-gram models. The n-gram models may be
#' customized by specifying data cleaning and tokenization options.
#'
#' @details
#' The `generate_model` method builds an n-gram model from the input text
#' file. The model may be customized by specifying the data cleaning and
#' tokenization options.
#'
#' The data cleaning options include removal of punctuation, stop words, extra
#' space, non-dictionary words and bad words. The tokenization options include
#' n-gram number and word stemming.
ModelGenerator <- R6::R6Class(
  "ModelGenerator",
  inherit = Base,
  public = list(
    #' @description
    #' It initializes the current object. It is used to set the maximum
    #' n-gram number, sample size, input file name, data cleaner options,
    #' tokenization options and verbose option. All model settings are
    #' stored in an internal `Model` object.
    #' @param name The model name.
    #' @param desc The model description.
    #' @param fn The model file name.
    #' @param df The path of the input text file. It should be the short
    #'   file name and should be present in the data directory.
    #' @param n The n-gram size of the model.
    #' @param ssize The sample size as a proportion of the input file.
    #' @param dir The directory containing the input and output files.
    #' @param dc_opts The data cleaner options.
    #' @param tg_opts The token generator options.
    #' @param ve The level of detail in the information messages.
    #' @export
    initialize = function(name = NULL,
                          desc = NULL,
                          fn = NULL,
                          df = NULL,
                          n = 4,
                          ssize = 0.3,
                          dir = ".",
                          dc_opts = list(),
                          tg_opts = list(),
                          ve = 0) {
      # The base class is initialized. Only the verbosity level is passed
      # on; the first two arguments are deliberately left as NULL.
      super$initialize(NULL, NULL, ve)
      # The Model object that holds the configuration is created. The
      # private methods read their settings back through get_config().
      private$m <- Model$new(
        name = name,
        desc = desc,
        fn = fn,
        df = df,
        n = n,
        ssize = ssize,
        dir = dir,
        dc_opts = dc_opts,
        tg_opts = tg_opts,
        ve = ve
      )
    },
    #' @description
    #' It generates the model using the parameters passed to
    #' the object's constructor. It generates a n-gram model file and saves
    #' it to the model directory. The steps are run in a fixed order:
    #' sampling and cleaning, splitting into data files, n-gram token
    #' generation, transition probability generation and saving the model.
    #' @examples
    #' # Start of environment setup code
    #' # The level of detail in the information messages
    #' ve <- 0
    #' # The name of the folder that will contain all the files. It will be
    #' # created in the current directory. NULL implies tempdir will be used
    #' fn <- NULL
    #' # The required files. They are default files that are part of the
    #' # package
    #' rf <- c("input.txt")
    #' # An object of class EnvManager is created
    #' em <- EnvManager$new(ve = ve, rp = "./")
    #' # The required files are downloaded
    #' ed <- em$setup_env(rf, fn)
    #' # End of environment setup code
    #'
    #' # ModelGenerator class object is created
    #' mg <- ModelGenerator$new(
    #'     name = "default-model",
    #'     desc = "1 MB size and default options",
    #'     fn = "def-model.RDS",
    #'     df = "input.txt",
    #'     n = 4,
    #'     ssize = 0.99,
    #'     dir = ed,
    #'     dc_opts = list(),
    #'     tg_opts = list(),
    #'     ve = ve
    #' )
    #' # The n-gram model is generated
    #' mg$generate_model()
    #'
    #' # The test environment is removed. Comment the below line, so the
    #' # files generated by the function can be viewed
    #' em$td_env()
    generate_model = function() {
      # The information message is displayed
      private$dh("Generating n-gram model", "-", md = 1)
      # The cleaned sample data file is generated
      private$generate_sample()
      # The train, test and validation data files are generated
      private$generate_data_files()
      # The n-gram tokens are generated
      private$generate_ngram_tokens()
      # The transition probabilities data is generated
      private$generate_tp_data()
      # The model is saved
      private$save_model()
      # The information message is shown
      private$dh("DONE", "=", md = 1)
    }
  ),
  private = list(
    # @field m The model object. It is created in initialize() and holds
    #   the configuration read by every private method.
    m = NULL,
    # @description
    # Saves the model to a file. The output path is built from the "dir"
    # and "fn" configuration values of the model object.
    save_model = function() {
      # The directory path
      dir <- private$m$get_config("dir")
      # The model file name
      ofn <- private$m$get_config("fn")
      # The output file path
      ofp <- paste0(dir, "/", ofn)
      # The information message is shown
      private$dh("Saving model", "-", md = 1)
      # The model object is loaded before it is written out
      private$m$load_model()
      # The model object is saved using the output file path
      private$save_obj(private$m, ofp)
    },
    # @description
    # Generates a cleaned sample file of given size from the
    # given input data file. Unless the data cleaning options already
    # provide an output file, the cleaned sample is written to
    # sample-clean.txt inside the model directory.
    generate_sample = function() {
      # The input data file name
      df <- private$m$get_config("df")
      # The sample size
      ssize <- private$m$get_config("ssize")
      # The directory path
      dir <- private$m$get_config("dir")
      # The data cleaning options
      dc_opts <- private$m$get_config("dc_opts")
      # If the output file name is not set, the default name is used
      if (is.null(dc_opts[["output_file"]])) {
        dc_opts[["output_file"]] <- paste0(dir, "/sample-clean.txt")
      }
      # The DataSampler object is created
      ds <- DataSampler$new(dir = dir, ve = private$ve)
      # Sample is taken and cleaned. The flags are spelled out as
      # TRUE/FALSE because T and F are ordinary variables that can be
      # reassigned.
      ds$generate_sample(df, ssize, TRUE, FALSE, "sample.txt", TRUE, dc_opts)
    },
    # @description
    # Generates test, train and validation files from the
    # cleaned sample file. The data is split in the proportions
    # 0.8 (train), 0.1 (test) and 0.1 (validate).
    generate_data_files = function() {
      # The directory path
      dir <- private$m$get_config("dir")
      # The DataSampler object is created
      ds <- DataSampler$new(dir = dir, ve = private$ve)
      # The training, testing and validation data sets are generated.
      # NOTE(review): the input name is fixed to "sample-clean.txt", while
      # generate_sample() honours a custom dc_opts$output_file - confirm
      # that both refer to the same file when that option is set.
      ds$generate_data("sample-clean.txt", list(
        train = .8,
        test = .1,
        validate = .1
      ))
    },
    # @description
    # Generates transition probabilities data from n-gram token
    # file. The transition probabilities data is saved as files.
    generate_tp_data = function() {
      # The n-gram number
      n <- private$m$get_config("n")
      # The directory path
      dir <- private$m$get_config("dir")
      # The options for generating combined transition probabilities
      tp_opts <- list(
        "n" = n,
        "save_tp" = TRUE,
        "format" = "obj",
        "dir" = dir
      )
      # The TPGenerator object is created
      tp <- TPGenerator$new(tp_opts, private$ve)
      # The transition probabilities are generated
      tp$generate_tp()
    },
    # @description
    # Generates n-gram tokens from the train.txt file in the model
    # directory. One token file is generated for every n-gram size from
    # 1 up to the configured n-gram number.
    generate_ngram_tokens = function() {
      # The n-gram number
      n <- private$m$get_config("n")
      # The directory path
      dir <- private$m$get_config("dir")
      # The TokenGenerator object options
      tg_opts <- private$m$get_config("tg_opts")
      # The directory is set
      tg_opts$dir <- dir
      # The clean train data file name
      fn <- paste0(dir, "/train.txt")
      # For each n-gram number, the n-gram token file is generated.
      # seq_len() is used instead of 1:n so that n = 0 gives an empty loop
      # rather than iterating over c(1, 0), and a negative n fails loudly
      # instead of counting downwards.
      for (i in seq_len(n)) {
        # The n-gram number is set
        tg_opts$n <- i
        # The TokenGenerator object is created
        tg <- TokenGenerator$new(fn, tg_opts, private$ve)
        # The n-gram tokens are generated
        tg$generate_tokens()
      }
    }
  )
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.