# R/tp-generator.R

#' Generates transition probabilities for n-grams
#'
#' @description
#' It provides a method for generating transition probabilities for
#' the given n-gram size. It also provides a method for generating the combined
#' transition probabilities data for n-gram sizes from 1 to the given size. The
#' combined transition probabilities data can be used to implement back-off.
#'
#' @details
#' It provides a method for generating n-gram transition probabilities.
#' It reads n-gram frequencies from an input text file that is generated by the
#' TokenGenerator class.
#'
#' It parses each n-gram into a prefix, a next word, the next word frequency and
#' the next word probability. The next word probability is the maximum
#' likelihood estimate, i.e. the next word frequency divided by the total
#' frequency of all n-grams that share the same prefix.
#'
#' Each n-gram prefix is converted to a numeric hash using the digest2int
#' function. The next word is replaced with the position of the next word in the
#' list of all words. The transition probabilities data is stored as a data
#' frame in a file.
#'
#' Another method is provided that combines the transition probabilities for
#' n-grams of size 1 to the given size. The combined transition probabilities
#' can be saved to a file as a data frame. This file may be regarded as a
#' complete, self-contained n-gram model. By combining the transition
#' probabilities of n-grams, back-off may be used to evaluate word probabilities
#' or predict the next word.
#' @importFrom stringr str_match
#' @importFrom digest digest2int
#' @importFrom dplyr group_by mutate
TPGenerator <- R6::R6Class(
    "TPGenerator",
    inherit = Base,
    public = list(
        #' @description
        #' It initializes the current obj. It is used to set the
        #' transition probabilities options and verbose option.
        #' @param opts The options for generating the transition probabilities.
        #'   Options that are not given keep their default values.
        #' * **save_tp**. If the data should be saved. Defaults to TRUE.
        #' * **n**. The n-gram size. Defaults to 1.
        #' * **dir**. The directory containing the input and output files.
        #'   Defaults to "./data/model".
        #' * **format**. The format for the output. Defaults to "obj". There
        #'   are two options.
        #'     * **plain**. The data is stored in plain text (".txt" files).
        #'     * **obj**. The data is stored as a R obj (".RDS" files).
        #' @param ve The level of detail in the information messages.
        #' @export
        initialize = function(opts = list(), ve = 0) {
            # The given options override the default options
            private$tp_opts <- modifyList(private$tp_opts, opts)
            # The base class is initialized
            super$initialize(NULL, NULL, ve)
            # The processed output is initialized
            private$p_output <- data.frame()
        },

        #' @description
        #' It first generates the transition probabilities for each
        #' n-gram of size from 1 to the given size. The transition probabilities
        #' are then combined into a single data frame and saved to the output
        #' folder that is given as parameter to the current object.
        #'
        #' By combining the transition probabilities for all n-gram sizes from 1
        #' to n, back-off can be used to calculate next word probabilities or
        #' predict the next word.
        #'
        #' If the combined model file already exists, then a message is shown
        #' and nothing is generated.
        #' @examples
        #' # Start of environment setup code
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # The name of the folder that will contain all the files. It will be
        #' # created in the current directory. NULL implies tempdir will be used
        #' fn <- NULL
        #' # The required files. They are default files that are part of the
        #' # package
        #' rf <- c("n1.RDS", "n2.RDS", "n3.RDS", "n4.RDS")
        #' # An object of class EnvManager is created
        #' em <- EnvManager$new(ve = ve, rp = "./")
        #' # The required files are downloaded
        #' ed <- em$setup_env(rf, fn)
        #' # End of environment setup code
        #'
        #' # The list of output files
        #' fns <- c("words", "model-4", "tp2", "tp3", "tp4")
        #'
        #' # The TPGenerator object is created
        #' tp <- TPGenerator$new(opts = list(n = 4, dir = ed), ve = ve)
        #' # The combined transition probabilities are generated
        #' tp$generate_tp()
        #'
        #' # The test environment is removed. Comment the below line, so the
        #' # files generated by the function can be viewed
        #' em$td_env()
        generate_tp = function() {
            # The maximum n-gram size
            nmax <- private$tp_opts[["n"]]
            # An invalid size is rejected before any work is done
            private$validate_ngram_size(nmax)
            # The information message
            msg <- paste0(
                "Generating Transition Probabilities for n = 1:", nmax
            )
            # Information message is shown
            private$dh(msg, "-", md = 1)
            # The processed output is cleared
            private$p_output <- data.frame()

            # The short output file name
            fn <- paste0("model-", nmax, private$get_tp_ext())
            # The model file name path
            fp <- file.path(private$tp_opts[["dir"]], fn)
            # If the combined tp file already exists
            if (file.exists(fp)) {
                # Information message is shown
                private$dm(
                    "The output file: ", fp, " already exists\n",
                    md = 1, ty = "w"
                )
            }
            else {
                # The tp data of each n-gram size. The pieces are collected
                # in a list and combined once after the loop.
                parts <- list()
                # For each n-gram number, the transition probabilities data is
                # generated. generate_tp_for_n sets the "n" option, so after
                # the loop it is equal to nmax again.
                for (n in seq_len(nmax)) {
                    # The transition probabilities or word list is generated
                    self$generate_tp_for_n(n)
                    # n = 1 only produces the word list, which is not part of
                    # the combined tp data
                    if (n > 1) {
                        # The tp data for the current n is kept
                        parts[[n]] <- list(
                            pre = private$p_output$pre,
                            nw = private$p_output$nw,
                            prob = private$p_output$prob
                        )
                        # The processed output is cleared
                        private$p_output <- data.frame()
                    }
                }
                # The given column of all the pieces joined in n-gram order
                combine <- function(col) {
                    unlist(
                        lapply(parts, function(p) p[[col]]),
                        use.names = FALSE
                    )
                }
                # The processed output is set to the combined tp data
                private$p_output <-
                    data.frame(
                        "pre" = combine("pre"),
                        "nw" = combine("nw"),
                        "prob" = combine("prob")
                    )

                # If the data should be saved
                if (private$tp_opts[["save_tp"]]) {
                    private$save_data(fn)
                }
                # Information message is shown
                private$dh("DONE", "=", md = 1)
            }
        },

        #' @description
        #' It generates the transition probabilities table for the
        #' given n-gram size. If the output file for the given size already
        #' exists, then it is read instead of being regenerated. Otherwise the
        #' n-gram token frequencies are read from the input file.
        #'
        #' For n > 1, it generates a data frame whose columns are the hashed
        #' n-gram prefix, the next word id and the next word probability. If
        #' n = 1, then the list of words along with the word probabilities is
        #' generated. The data may be saved to a file as plain text or as a R
        #' obj.
        #' @param n The n-gram size for which the tp data is generated. It
        #'   should be greater than or equal to 1.
        generate_tp_for_n = function(n) {
            # An invalid size is rejected before the options are changed
            private$validate_ngram_size(n)
            # The n value is set
            private$tp_opts[["n"]] <- n
            # The output format
            fo <- private$tp_opts[["format"]]
            # The output file name
            fn <- private$get_file_name(TRUE)
            # If the output file already exists
            if (file.exists(fn)) {
                # The information message is shown
                private$dm(
                    "The file: ", fn, " already exists",
                    md = 1, ty = "w"
                )
                # The file is read
                data <- private$read_data(fn, fo, TRUE)
                # If n = 1
                if (n == 1) {
                    # The word list is set to the data
                    private$wl <- data
                }
                else {
                    # The processed output is set to the data
                    private$p_output <- data
                }
            }
            else {
                # The information message
                msg <- paste0(
                    "Generating transition probabilities for n = ", n)
                # Information message is shown
                private$dh(msg, "-", md = 1)

                # The input file name
                private$fn <- private$get_file_name(FALSE)
                # The data is read
                df <- private$read_data(private$fn, fo, TRUE)
                # If n = 1
                if (n == 1) {
                    # The word list is set to the data frame
                    private$wl <- df
                    # A probabilities column is added
                    private$wl$prob <- (private$wl$freq / sum(private$wl$freq))
                    # The probabilities are rounded to 8 decimal places
                    private$wl$prob <- round(private$wl$prob, 8)
                    # The frequency column is removed
                    private$wl$freq <- NULL
                }
                else {
                    # The 1-gram words are read
                    private$read_words()
                    # Each n-gram is split into "prefix_nextword". The match
                    # is greedy, so the split is made at the last underscore.
                    # Column 2 is the prefix and column 3 is the next word.
                    m <- str_match(df$pre, "(.+)_(.+)")
                    # The hash of the prefix is taken
                    np <- digest2int(m[, 2])
                    # The next word id based on index position. It is NA for
                    # a word that is not in the word list.
                    nw <- match(m[, 3], private$wl$pre)
                    # The next word frequencies
                    nf <- df$freq
                    # The data is added to a data frame
                    df <- data.frame(
                        "pre" = np,
                        "nw" = nw,
                        "freq" = nf
                    )
                    # The processed output is set to the data frame
                    private$p_output <- df
                    # The next word probabilities are generated
                    private$generate_probs()
                    # The frequency column is removed
                    private$p_output$freq <- NULL
                }
                # If the data should be saved
                if (private$tp_opts[["save_tp"]]) {
                    private$save_data()
                }
                # Information message is shown
                private$dh("DONE", "=", md = 1)
            }
        }
    ),
    private = list(
        # @field tp_opts The options for generating the transition
        #   probabilities.
        # * **save_tp**. If the data should be saved.
        # * **n**. The n-gram number
        # * **dir**. The directory containing the input and output files.
        # * **format**. The format for the output. There are two options.
        #     * **plain**. The data is stored in plain text.
        #     * **obj**. The data is stored as a R obj.
        tp_opts = list(
            "save_tp" = TRUE,
            "n" = 1,
            "dir" = "./data/model",
            "format" = "obj"
        ),

        # @field The list of unique words. It is read from the 1-gram
        #   frequencies. The frequency column is replaced by a probability
        #   column when the word list is generated for n = 1.
        wl = data.frame(),

        # @description
        # It checks that the given n-gram size is a single value that is
        # greater than or equal to 1. An error is raised if it is not. Without
        # this check a size below 1 fails later with an "object not found"
        # error, because no file name or data is defined for it.
        # @param n The n-gram size to check.
        validate_ngram_size = function(n) {
            # If the size is a single non-missing value that is at least 1
            is_valid <- length(n) == 1 && !is.na(n) && n >= 1
            if (!is_valid) {
                stop(
                    "The n-gram size must be a single value >= 1",
                    call. = FALSE
                )
            }
        },

        # @description
        # It returns the file extension for the current output format. The
        # extension is ".txt" for the "plain" format and ".RDS" for any other
        # format.
        get_tp_ext = function() {
            if (private$tp_opts[["format"]] == "plain") {
                ext <- ".txt"
            } else {
                ext <- ".RDS"
            }
            ext
        },

        # @description
        # It calculates the next word probabilities for the processed output.
        # Each probability is the frequency divided by the total frequency of
        # the rows with the same prefix, rounded to 8 decimal places. Nothing
        # is done if n = 1.
        generate_probs = function() {
            # The n-gram number
            n <- private$tp_opts[["n"]]
            # If n > 1
            if (n > 1) {
                # The output is copied to a variable
                df <- private$p_output
                # A new probability column is added. It is set to the sum of
                # frequency column for each prefix group. The grouping is
                # removed afterwards, so that the group metadata is not kept
                # in the output and not written to the tp files.
                df <- df %>%
                    group_by(pre) %>%
                    mutate(prob = sum(freq)) %>%
                    dplyr::ungroup()
                # Each frequency is divided by the sum to give the probability.
                df$prob <- round(df$freq / df$prob, 8)
                # The output is set to the updated data as a plain data frame,
                # which is the type it had before the probabilities were added
                private$p_output <- as.data.frame(df)
            }
        },

        # @description
        # It returns the name of the output or input file for the current
        # n-gram size. The input file is "n<n>". The output file is "words"
        # for n = 1 and "tp<n>" for n > 1. The directory and the file
        # extension are taken from the options.
        # @param is_output If the output file name is required.
        get_file_name = function(is_output) {
            # The n-gram number
            n <- private$tp_opts[["n"]]
            # The directory
            od <- private$tp_opts[["dir"]]
            # The file extension
            ext <- private$get_tp_ext()
            # If the input file name is required
            if (!is_output) {
                fn <- file.path(od, paste0("n", n, ext))
            }
            # The output file name for n = 1
            else if (n == 1) {
                fn <- file.path(od, paste0("words", ext))
            }
            # The output file name for n > 1
            else if (n > 1) {
                fn <- file.path(od, paste0("tp", n, ext))
            }
            # There is no output file for n < 1
            else {
                stop(
                    "There is no output file for an n-gram size below 1",
                    call. = FALSE
                )
            }

            fn
        },

        # @description
        # It saves the transition probabilities to a file in plain format or as
        # a R obj. The word list is saved if n = 1, otherwise the processed
        # output is saved. If the file name is not given, then it is generated
        # using the current object attributes.
        # @param fn The file name to use. It is relative to the directory
        #   given in the options.
        save_data = function(fn = NULL) {
            # The n-gram number
            n <- private$tp_opts[["n"]]
            # The directory
            od <- private$tp_opts[["dir"]]
            # The format
            fo <- private$tp_opts[["format"]]
            # If n = 1
            if (n == 1) {
                # The data to save
                data <- private$wl
            }
            # If n > 1
            else if (n > 1) {
                # The data to save
                data <- private$p_output
            }
            # There is no data for n < 1
            else {
                stop(
                    "There is no data to save for an n-gram size below 1",
                    call. = FALSE
                )
            }
            # If the file name is not given then it is generated
            if (is.null(fn)) {
                fn <- private$get_file_name(TRUE)
            } else {
                fn <- file.path(od, fn)
            }
            # The data is written
            private$write_data(data, fn, fo, FALSE)
        },

        # @description
        # It reads the list of 1-gram words from the "words" file, unless the
        # word list has already been loaded.
        read_words = function() {
            # If the word list has not been read
            if (nrow(private$wl) == 0) {
                # The 1-gram words file name
                fn <- file.path(
                    private$tp_opts[["dir"]],
                    paste0("words", private$get_tp_ext())
                )
                # The words are read
                private$wl <- private$read_data(
                    fn, private$tp_opts[["format"]], FALSE
                )
            }
        }
    )
)

# Try the wordpredictor package in your browser
#
# Any scripts or data that you put into this service are public.
#
# wordpredictor documentation built on Jan. 4, 2022, 5:07 p.m.