R/model-predictor.R

#' Allows predicting text, calculating word probabilities and Perplexity
#'
#' @description
#' It provides a method for predicting the new word given a set of
#' previous words. It also provides a method for calculating the Perplexity
#' score for a set of words. Furthermore it provides a method for calculating
#' the probability of a given word and set of previous words.
#' @importFrom digest digest2int
#' @importFrom SnowballC wordStem
ModelPredictor <- R6::R6Class(
    "ModelPredictor",
    inherit = Base,
    public = list(
        #' @description
        #' It initializes the current object. It is used to set the
        #' model file name and verbose options.
        #' @param mf The model file name.
        #' @param ve The level of detail in the information messages.
        #' @export
        initialize = function(mf, ve = 0) {
            # The base class is initialized
            super$initialize(NULL, NULL, ve)
            # If the model file name is not valid, then an error is thrown
            if (!file.exists(mf)) {
                private$dm("Invalid model file: ", mf, md = -1, ty = "e")
            } else {
                # The model object is read
                private$m <- private$read_obj(mf)
            }
        },

        #' @description
        #' Returns the Model class object.
        #' @return The Model class object is returned.
        get_model = function() {
            # The model object is returned
            return(private$m)
        },

        #' @description
        #' The Perplexity for the given sentence is calculated. For
        #' each word, the probability of the word given the previous words is
        #' calculated. The probabilities are multiplied and then inverted. The
        #' nth root of the result is the perplexity, where n is the number of
        #' words in the sentence. If the stem_words tokenization option was
        #' specified when creating the given model file, then the previous words
        #' are converted to their stems.
        #' @param words The list of words.
        #' @return The perplexity of the given list of words.
        #' @examples
        #' # Start of environment setup code
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # The name of the folder that will contain all the files. It will be
        #' # created in the current directory. NULL implies tempdir will be used
        #' fn <- NULL
        #' # The required files. They are default files that are part of the
        #' # package
        #' rf <- c("def-model.RDS")
        #' # An object of class EnvManager is created
        #' em <- EnvManager$new(ve = ve, rp = "./")
        #' # The required files are downloaded
        #' ed <- em$setup_env(rf, fn)
        #' # End of environment setup code
        #'
        #' # The model file name
        #' mfn <- paste0(ed, "/def-model.RDS")
        #' # ModelPredictor class object is created
        #' mp <- ModelPredictor$new(mf = mfn, ve = ve)
        #' # The sentence whoose Perplexity is to be calculated
        #' l <- "last year at this time i was preparing for a trip to rome"
        #' # The line is split in to words
        #' w <- strsplit(l, " ")[[1]]
        #' # The Perplexity of the sentence is calculated
        #' p <- mp$calc_perplexity(w)
        #' # The sentence Perplexity is printed
        #' print(p)
        #' # The test environment is removed. Comment the below line, so the
        #' # files generated by the function can be viewed
        #' em$td_env()
        calc_perplexity = function(words) {
            # The model size
            n <- private$m$get_config("n")
            # The options for token generation
            tg_opts <- private$m$get_config("tg_opts")

            # The number of words in the sentence
            wl <- length(words)
            # The product of the word probabilities
            prob_prod <- 1
            # For each word, the probability of the word is calculated
            for (i in 1:wl) {
                # The word
                word <- words[i]
                # The list of previous words
                pw <- NULL
                # If i is more than 1
                if (i > 1) {
                    # The start index
                    start <- 1
                    # If i > self$model
                    if (i > n) start <- i - (n - 1)
                    # The list of previous words
                    pw <- words[start:(i - 1)]
                    # If the words should be stemmed
                    if (tg_opts[["stem_words"]]) {
                        # The previous words are stemmed
                        pw <- wordStem(pw)
                    }
                }
                # The word probability
                prob <- self$get_word_prob(word, pw)
                # The probability product is updated
                prob_prod <- prob_prod * prob
            }
            # The inverse of the number of words
            iwl <- 1 / wl
            # The nth root of the inverse of the probability product is taken
            p <- (1 / prob_prod)
            p <- p^iwl
            p <- round(p)

            return(p)
        },

        #' @description
        #' Predicts the next word given a list of previous words. It
        #' checks the last n previous words in the transition probabilities
        #' data, where n is equal to 1 - n-gram size of model. If there is a
        #' match, the top 3 next words with highest probabilities are returned.
        #' If there is no match, then the last n-1 previous words are checked.
        #' This process is continued until the last word is checked. If there is
        #' no match, then empty result is returned. The given words may
        #' optionally be stemmed.
        #' @param words A character vector of previous words or a single vector
        #'   containing the previous word text.
        #' @param count The number of results to return.
        #' @param dc A DataCleaner object. If it is given, then the given words
        #   are cleaned
        #' @return The top 3 predicted words along with their probabilities.
        #' @examples
        #' # Start of environment setup code
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # The name of the folder that will contain all the files. It will be
        #' # created in the current directory. NULL implies tempdir will be used
        #' fn <- NULL
        #' # The required files. They are default files that are part of the
        #' # package
        #' rf <- c("def-model.RDS")
        #' # An object of class EnvManager is created
        #' em <- EnvManager$new(ve = ve, "rp" = "./")
        #' # The required files are downloaded
        #' ed <- em$setup_env(rf, fn)
        #' # End of environment setup code
        #'
        #' # The model file name
        #' mfn <- paste0(ed, "/def-model.RDS")
        #' # ModelPredictor class object is created
        #' mp <- ModelPredictor$new(mf = mfn, ve = ve)
        #' # The next word is predicted
        #' nws <- mp$predict_word("today is", count = 10)
        #' # The predicted next words are printed
        #' print(nws)
        #'
        #' # The test environment is removed. Comment the below line, so the
        #' # files generated by the function can be viewed
        #' em$td_env()
        predict_word = function(words, count = 3, dc = NULL) {
            # The tp data is fetched from the model object
            tp <- private$m$get_config("tp")
            # The loop counter
            c <- 1
            # The required results
            result <- list("found" = F, "words" = "", "probs" = "")
            # The previous words are fetched
            pw <- private$get_prev_words(words, dc)
            # If the previous words are NULL
            if (is.null(pw)) {
                return(result)
            }
            # The length of previous words
            pwl <- length(pw)
            # Each n-gram in the previous word list is checked starting from
            # largest n-gram
            for (i in pwl:1) {
                # The previous words to check
                tpw <- pw[c:pwl]
                # The key to use for the transition probabilities data
                k <- paste(tpw, collapse = "_")
                # The key is converted to a numeric hash
                h <- digest2int(k)
                # The transition probabilities data is checked
                res <- tp[tp$pre == h, ]
                # The results are checked
                result <- private$check_results(res, count, k)
                # If the data was found
                if (result[["found"]]) break
                # Information message is shown
                private$dm("Backing off to ", (i), "-gram\n", md = 3)
                # The counter is increased by 1
                c <- c + 1
            }

            return(result)
        },

        #' @description
        #' Calculates the probability of the given word given the
        #' previous words. The last n words are converted to numeric hash using
        #' digest2int function. All other words are ignored. n is equal to 1 -
        #' size of the n-gram model. The hash is looked up in a data frame of
        #' transition probabilities. The last word is converted to a number by
        #' checking its position in a list of unique words. If the hash and the
        #' word position were found, then the probability of the previous word
        #' and hash is returned. If it was not found, then the hash of the n-1
        #' previous words is taken and the processed is repeated. If the data
        #' was not found in the data frame, then the word probability is
        #' returned. This is known as back-off. If the word probability could
        #' not be found then the default probability is returned. The default
        #' probability is calculated as 1/(N+V), Where N = number of words in
        #' corpus and V is the number of dictionary words.
        #' @param word The word whose probability is to be calculated.
        #' @param pw The previous words.
        #' @return The probability of the word given the previous words.
        #' @examples
        #' # Start of environment setup code
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # The name of the folder that will contain all the files. It will be
        #' # created in the current directory. NULL implies tempdir will be used
        #' fn <- NULL
        #' # The required files. They are default files that are part of the
        #' # package
        #' rf <- c("def-model.RDS")
        #' # An object of class EnvManager is created
        #' em <- EnvManager$new(ve = ve, "rp" = "./")
        #' # The required files are downloaded
        #' ed <- em$setup_env(rf, fn)
        #' # End of environment setup code
        #'
        #' # The model file name
        #' mfn <- paste0(ed, "/def-model.RDS")
        #' # ModelPredictor class object is created
        #' mp <- ModelPredictor$new(mf = mfn, ve = ve)
        #' # The probability that the next word is "you" given the prev words
        #' # "how" and "are"
        #' prob <- mp$get_word_prob(word = "you", pw = c("how", "are"))
        #' # The probability is printed
        #' print(prob)
        #'
        #' # The test environment is removed. Comment the below line, so the
        #' # files generated by the function can be viewed
        #' em$td_env()
        get_word_prob = function(word, pw) {
            # The tp data is fetched from the model object
            tp <- private$m$get_config("tp")
            # The word list data is fetched from the model object
            wl <- private$m$get_config("wl")
            # The default probability is fetched from the model object
            dp <- private$m$get_config("dp")
            # If the default probability is not set, then an error is raised
            if (is.null(dp)) {
                private$dm(
                    "The default probability is not set in the model file !",
                    md = -1,
                    ty = "e"
                )
            }
            # The length of previous words
            pwl <- length(pw)
            # The probability of the word given the previous words. It is
            # initialized to the default probability, which should be 1/(N+V)
            prob <- dp
            # The loop counter
            c <- 1
            # Indicates if the word was found
            found <- FALSE
            # The next word id
            nw <- match(word, wl$pre)
            # If the next word was not found
            if (is.na(nw)) {
                # Information message is shown
                private$dm(
                    "The next word: ", word, " was not found\n",
                    md = 3
                )
                # The default probability is returned
                return(prob)
            }
            # If the previous word count is 0
            if (pwl == 0) {
                return(prob)
            }

            # The previous words are checked
            for (i in pwl:1) {
                # The previous words to check
                tpw <- pw[c:pwl]
                # The key to use for the transition matrix
                k <- paste(tpw, collapse = "_")
                # The key is converted to a numeric hash
                h <- digest2int(k)
                # The transition probabilities data is checked
                res <- tp[tp$pre == h & tp$nw == nw, ]
                # If the prefix was found
                if (nrow(res) > 0) {
                    # The word was found
                    found <- TRUE
                    # The probability is set
                    prob <- as.numeric(res$prob)
                    # The information message
                    private$dm(
                        "The n-gram key: ", k,
                        " and the next word: ", word, " were found\n",
                        md = 3
                    )
                    # The loop ends
                    break
                }
                else {
                    # The information message
                    private$dm(
                        "The n-gram key: ", k,
                        " and the next word: ", word, " were not found\n",
                        md = 3
                    )
                }
                # Information message is shown
                private$dm("Backing off to ", (i), "-gram\n", md = 3)
                # The counter is increased by 1
                c <- c + 1
            }

            # If the word was not found then the probability of the word is
            # checked in the n1-gram
            if (!found) {
                # If the word was not found
                if (sum(wl$pre == word) == 0) {
                    # Information message is shown
                    private$dm("Using default probability\n", md = 3)
                }
                else {
                    # The word probability
                    prob <- as.numeric(wl[wl$pre == word, "prob"])
                }
            }

            return(prob)
        }
    ),
    private = list(
        # @field m The model object.
        m = NULL,
        # @description
        # Fetches the list of previous words from the given list of words.
        # @param words A character vector of previous words or a single vector
        #   containing the previous word text.
        # @param dc A DataCleaner object. If it is given, then the given words
        #   are cleaned.
        # @return The list of previous words.
        get_prev_words = function(words, dc) {
            # The options for token generation
            tg_opts <- private$m$get_config("tg_opts")
            # The words are assigned to temp variable
            w <- words
            # If the DataCleaner obj was specified
            if (!is.null(dc)) {
                # If the words is a set of vectors
                if (length(w) > 1) {
                    # The words are converted to a single line of text
                    w <- paste0(w, collapse = " ")
                }
                # The words are cleaned
                w <- dc$clean_lines(w)
            }

            # If the words should be stemmed
            if (tg_opts[["stem_words"]]) {
                # The previous words are stemmed
                w <- wordStem(w)
            }

            # If the words are in the form of a line
            if (length(w) == 1) {
                # The words are split on space
                w <- strsplit(w, " ")[[1]]
            }

            # The length of previous words
            pwl <- length(w)
            # If the previous words length is 0
            if (pwl == 0) {
                return(NULL)
            }
            # If the previous word length is more than 3
            if (pwl > 3) {
                # The last 3 words are extracted.
                pw <- w[(pwl - 2):pwl]
            }
            else {
                pw <- w
            }
        },
        # @description
        # Checks the result from the tp table
        # @param res The rows from the combined tp table.
        # @param count The number of results to return.
        # @param k The key string used to search the tp table.
        # @return The results of checking tp table.
        check_results = function(res, count, k) {
            # The word list data is fetched from the model object
            wl <- private$m$get_config("wl")
            # The word was found
            found <- FALSE
            # The required results
            result <- list("found" = F, "words" = "", "probs" = "")
            # If the prefix was found
            if (nrow(res) > 0) {
                # The word was found
                found <- TRUE
                # The result is sorted by probability
                sres <- res[order(res$prob, decreasing = T), ]
                # The number of rows in the result set
                rcount <- nrow(sres)
                # If the number of results is more than the required number
                # of results
                if (rcount > count) {
                    # The result count is set to the required number of
                    # results
                    rc <- count
                }
                else {
                    # The result count is set to the number of results
                    rc <- rcount
                }
                # The required word probabilities
                probs <- sres$prob[1:rc]
                # The next words indexes
                ind <- sres$nw[1:rc]
                # The required words
                nw <- as.character(wl$pre[ind])

                # The result is updated
                result[["words"]] <- nw
                result[["probs"]] <- probs
                result[["found"]] <- T
                # Information message is shown
                private$dm("The n-gram key: ", k, " was found\n", md = 3)
            }
            else {
                private$dm("The n-gram key: ", k, " was not found\n", md = 3)
                # The result is updated
                result[["found"]] <- F
            }
            return(result)
        }
    )
)

Try the wordpredictor package in your browser

Any scripts or data that you put into this service are public.

wordpredictor documentation built on Jan. 4, 2022, 5:07 p.m.