# R/model-generator.R

#' Generates n-gram models from a text file
#'
#' @description
#' It provides a method for generating n-gram models. The n-gram models may be
#' customized by specifying data cleaning and tokenization options.
#'
#' @details
#' It provides a method that generates an n-gram model. The n-gram model
#' may be customized by specifying the data cleaning and tokenization options.
#'
#' The data cleaning options include removal of punctuation, stop words, extra
#' space, non-dictionary words and bad words. The tokenization options include
#' n-gram number and word stemming.
ModelGenerator <- R6::R6Class(
    "ModelGenerator",
    inherit = Base,
    public = list(
        #' @description
        #' It initializes the current object. It is used to set the maximum
        #' n-gram number, sample size, input file name, data cleaner options,
        #' tokenization options and verbose option.
        #' @param name The model name.
        #' @param desc The model description.
        #' @param fn The model file name.
        #' @param df The path of the input text file. It should be the short
        #'   file name and should be present in the data directory.
        #' @param n The n-gram size of the model.
        #' @param ssize The sample size as a proportion of the input file.
        #' @param dir The directory containing the input and output files.
        #' @param dc_opts The data cleaner options.
        #' @param tg_opts The token generator options.
        #' @param ve The level of detail in the information messages.
        #' @export
        initialize = function(name = NULL,
                              desc = NULL,
                              fn = NULL,
                              df = NULL,
                              n = 4,
                              ssize = 0.3,
                              dir = ".",
                              dc_opts = list(),
                              tg_opts = list(),
                              ve = 0) {

            # The base class is initialized. Only the verbosity level is
            # forwarded; the first two arguments are passed as NULL.
            super$initialize(NULL, NULL, ve)
            # An object of class Model is created. It stores all of the
            # configuration, which the private methods read back through
            # private$m$get_config().
            private$m <- Model$new(
                name = name,
                desc = desc,
                fn = fn,
                df = df,
                n = n,
                ssize = ssize,
                dir = dir,
                dc_opts = dc_opts,
                tg_opts = tg_opts,
                ve = ve
            )
        },

        #' @description
        #' It generates the model using the parameters passed to
        #' the object's constructor. It generates an n-gram model file and
        #' saves it to the model directory.
        #' @examples
        #' # Start of environment setup code
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # The name of the folder that will contain all the files. It will be
        #' # created in the current directory. NULL implies tempdir will be used
        #' fn <- NULL
        #' # The required files. They are default files that are part of the
        #' # package
        #' rf <- c("input.txt")
        #' # An object of class EnvManager is created
        #' em <- EnvManager$new(ve = ve, rp = "./")
        #' # The required files are downloaded
        #' ed <- em$setup_env(rf, fn)
        #' # End of environment setup code
        #'
        #' # ModelGenerator class object is created
        #' mg <- ModelGenerator$new(
        #'     name = "default-model",
        #'     desc = "1 MB size and default options",
        #'     fn = "def-model.RDS",
        #'     df = "input.txt",
        #'     n = 4,
        #'     ssize = 0.99,
        #'     dir = ed,
        #'     dc_opts = list(),
        #'     tg_opts = list(),
        #'     ve = ve
        #' )
        #' # The n-gram model is generated
        #' mg$generate_model()
        #'
        #' # The test environment is removed. Comment the below line, so the
        #' # files generated by the function can be viewed
        #' em$td_env()
        generate_model = function() {
            # The information message is displayed
            private$dh("Generating n-gram model", "-", md = 1)
            # The steps below run in a fixed order: each one works on the
            # files written into the model directory by the previous one.
            # The cleaned sample data file is generated
            private$generate_sample()
            # The data files are generated
            private$generate_data_files()
            # The n-gram tokens are generated
            private$generate_ngram_tokens()
            # The tp data is generated
            private$generate_tp_data()
            # The model is saved
            private$save_model()
            # The information message is shown
            private$dh("DONE", "=", md = 1)
        }
    ),
    private = list(
        # @field m The model object.
        m = NULL,

        # @description
        # Saves the model to a file. The file is written to the configured
        # directory using the configured model file name.
        save_model = function() {
            # The directory path
            dir <- private$m$get_config("dir")
            # The model file name
            ofn <- private$m$get_config("fn")
            # The output file path
            ofp <- file.path(dir, ofn)
            # The information message is shown
            private$dh("Saving model", "-", md = 1)
            # load_model() is called on the model object before it is saved
            private$m$load_model()
            # The model object is saved to the output file path
            private$save_obj(private$m, ofp)
        },

        # @description
        # Generates a cleaned sample file of given size from the
        # given input data file. The name of the output file is
        # sample-clean.txt, unless the data cleaning options already
        # contain an "output_file" entry.
        generate_sample = function() {
            # The input data file name
            df <- private$m$get_config("df")
            # The sample size
            ssize <- private$m$get_config("ssize")
            # The directory path
            dir <- private$m$get_config("dir")
            # The data cleaning options
            dc_opts <- private$m$get_config("dc_opts")
            # If the output file name is not set
            if (is.null(dc_opts[["output_file"]])) {
                # The default output file is placed in the model directory
                dc_opts[["output_file"]] <- file.path(dir, "sample-clean.txt")
            }
            # The DataSampler object is created
            ds <- DataSampler$new(dir = dir, ve = private$ve)
            # Sample is taken and cleaned. The arguments are positional.
            # NOTE(review): the meaning of the three logical flags is defined
            # by DataSampler$generate_sample, which is not visible here.
            ds$generate_sample(
                df, ssize, TRUE, FALSE, "sample.txt", TRUE, dc_opts
            )
        },

        # @description
        # Generates test, train and validation files from the
        # cleaned sample file. The name of the output files are train.txt,
        # test.txt and validation.txt.
        generate_data_files = function() {
            # The directory path
            dir <- private$m$get_config("dir")
            # The DataSampler object is created
            ds <- DataSampler$new(dir = dir, ve = private$ve)
            # The training, testing and validation data sets are generated.
            # NOTE(review): the input name is fixed to "sample-clean.txt",
            # while generate_sample honours a caller supplied
            # dc_opts$output_file. Presumably a custom output file is not
            # picked up here; confirm against DataSampler.
            ds$generate_data("sample-clean.txt", list(
                train = .8,
                test = .1,
                validate = .1
            ))
        },

        # @description
        # Generates transition probabilities data from n-gram token
        # file. The transition probabilities data is saved as files.
        generate_tp_data = function() {
            # The n-gram number
            n <- private$m$get_config("n")
            # The directory path
            dir <- private$m$get_config("dir")
            # The options for generating combined transition probabilities
            tp_opts <- list(
                "n" = n,
                "save_tp" = TRUE,
                "format" = "obj",
                "dir" = dir
            )
            # The TPGenerator object is created
            tp <- TPGenerator$new(tp_opts, private$ve)
            # The transition probabilities are generated
            tp$generate_tp()
        },

        # @description
        # Generates n-gram tokens from the cleaned data input file.
        # The n-gram tokens are saved as files. One token file is generated
        # for every n-gram size from 1 up to the configured n-gram number.
        generate_ngram_tokens = function() {
            # The n-gram number
            n <- private$m$get_config("n")
            # The directory path
            dir <- private$m$get_config("dir")
            # The TokenGenerator object options
            tg_opts <- private$m$get_config("tg_opts")
            # The directory is set
            tg_opts[["dir"]] <- dir
            # The clean train data file name
            fn <- file.path(dir, "train.txt")
            # For each n-gram number, the n-gram token file is generated.
            # seq_len() is used so that n = 0 gives no iterations, instead of
            # the sequence 1, 0 that 1:n would produce.
            for (i in seq_len(n)) {
                # The n-gram number is set
                tg_opts[["n"]] <- i
                # The TokenGenerator object is created
                tg <- TokenGenerator$new(fn, tg_opts, private$ve)
                # The n-gram tokens are generated
                tg$generate_tokens()
            }
        }
    )
)

# Try the wordpredictor package in your browser
#
# Any scripts or data that you put into this service are public.
#
# wordpredictor documentation built on Jan. 4, 2022, 5:07 p.m.