R/token-generator.R

#' Generates n-grams from text files
#'
#' @description
#' It generates n-gram tokens along with their frequencies. The data
#' may be saved to a file in plain text format or as a R object.
#'
#' @importFrom SnowballC wordStem
#' @importFrom dplyr group_by summarize_all %>%
TokenGenerator <- R6::R6Class(
    "TokenGenerator",
    inherit = Base,
    public = list(
        #' @description
        #' It initializes the current obj. It is used to set the file name,
        #' tokenization options and verbose option.
        #' @param fn The path to the input file.
        #' @param opts The options for generating the n-gram tokens.
        #' * **n**. The n-gram size.
        #' * **save_ngrams**. If the n-gram data should be saved.
        #' * **min_freq**. All n-grams with frequency less than min_freq are
        #'     ignored.
        #' * **line_count**. The number of lines to process at a time.
        #' * **stem_words**. If words should be transformed to their stems.
        #' * **dir**. The dir where the output file should be saved.
        #' * **format**. The format for the output. There are two options.
        #'     * **plain**. The data is stored in plain text.
        #'     * **obj**. The data is stored as a R obj.
        #' @param ve The level of detail in the information messages.
        #' @export
        initialize = function(fn = NULL, opts = list(), ve = 0) {
            # The given options are merged with the opts attribute
            private$tg_opts <- modifyList(private$tg_opts, opts)
            # The base class is initialized
            super$initialize(fn, private$tg_opts$line_count, ve)
            # The processed output is initialized
            private$p_output <- NULL
        },

        #' @description
        #' It generates n-gram tokens and their frequencies from the
        #' given file name. The tokens may be saved to a text file as plain text
        #' or a R object.
        #' @return The data frame containing n-gram tokens along with their
        #'   frequencies.
        #' @examples
        #' # Start of environment setup code
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # The name of the folder that will contain all the files. It will be
        #' # created in the current directory. NULL implies tempdir will be used
        #' fn <- NULL
        #' # The required files. They are default files that are part of the
        #' # package
        #' rf <- c("test-clean.txt")
        #' # An object of class EnvManager is created
        #' em <- EnvManager$new(ve = ve, rp = "./")
        #' # The required files are downloaded
        #' ed <- em$setup_env(rf, fn)
        #' # End of environment setup code
        #'
        #' # The n-gram size
        #' n <- 4
        #' # The test file name
        #' tfn <- paste0(ed, "/test-clean.txt")
        #' # The n-gram number is set
        #' tg_opts <- list("n" = n, "save_ngrams" = TRUE, "dir" = ed)
        #' # The TokenGenerator object is created
        #' tg <- TokenGenerator$new(tfn, tg_opts, ve = ve)
        #' # The n-gram tokens are generated
        #' tg$generate_tokens()
        #'
        #' # The test environment is removed. Comment the below line, so the
        #' # files generated by the function can be viewed
        #' em$td_env()
        generate_tokens = function() {
            # The processed output is initialized
            private$p_output <- NULL
            # The output file name
            fn <- private$get_file_name()
            # If the output file already exists
            if (file.exists(fn)) {
                # The information message is shown
                private$dm("The ",
                    private$tg_opts[["n"]],
                    "-gram file already exists\n",
                    md = 1,
                    ty = "w"
                )
                # If the n-gram data should not be saved
                if (!private$tg_opts[["save_ngrams"]]) {
                    # The n-grams file is read
                    private$p_output <- private$read_data(
                        fn, private$tg_opts[["format"]], T
                    )
                }
            }
            else {
                # The information message
                msg <- paste0("Generating ", private$tg_opts[["n"]])
                msg <- paste0(msg, "-gram tokens")
                # The information message is shown
                private$dh(msg, "-", md = 1)
                # The base class process_file function is called
                private$process_file(
                    private$pre_process, private$process,
                    private$post_process
                )
                # The information message is shown
                private$dh("DONE", "=", md = 1)
            }
        }
    ),
    private = list(
        # @field tg_opts The options for the token generator obj.
        # * **n**. The n-gram size.
        # * **save_ngrams**. If the n-gram data should be saved.
        # * **min_freq**. All n-grams with frequency less than min_freq are
        #     ignored.
        # * **stem_words**. If words should be transformed to their stems.
        # * **line_count**. The number of lines to process at a time.
        # * **dir**. The dir where the output file should be saved.
        # * **format**. The format for the output. There are two options.
        #     * **plain**. The data is stored in plain text.
        #     * **obj**. The data is stored as a R obj.
        tg_opts = list(
            "n" = 1,
            "save_ngrams" = F,
            "stem_words" = F,
            "min_freq" = -1,
            "line_count" = 5000,
            "dir" = NULL,
            "format" = "obj"
        ),

        # @description
        # Performs processing for the generate_tokens function. It
        # processes the given line of text. It converts each line of text into
        # n-grams of the given size. The frequency of each n-gram is updated.
        # @param lines The lines of text.
        process = function(lines) {
            # n-grams are extracted from each line
            ngrams <- private$generate_ngrams(lines)
            # The n-gram words are appended to the processed output
            private$p_output <- c(private$p_output, ngrams)
        },

        # @description
        # It returns the name of the output n-gram file.
        get_file_name = function() {
            # The n-gram number
            n <- private$tg_opts[["n"]]
            # The format
            fo <- private$tg_opts[["format"]]
            # The output directory
            dir <- private$tg_opts[["dir"]]
            # The file extension
            if (fo == "plain") {
                ext <- ".txt"
            } else {
                ext <- ".RDS"
            }

            # The file name
            file_name <- paste0(dir, "/n", n, ext)

            return(file_name)
        },

        # @description
        # It saves the n-gram tokens and their frequencies to a text file.
        post_process = function() {
            # The information message
            msg <- paste0("Calculating ", private$tg_opts[["n"]])
            msg <- paste0(msg, "-gram frequencies")
            # The information message is shown
            private$dm(msg, md = 1)
            # The output is copied to a variable
            df <- data.frame("pre" = private$p_output)
            # A frequency column is added
            df$freq <- 1
            # Each prefix is grouped and summed
            df <- df %>%
                group_by(pre) %>%
                summarize_all(sum)
            # The information message is shown
            private$dm(" \u2714\n", md = 1)
            # If the minimum n-gram frequency is given
            if (private$tg_opts[["min_freq"]] > -1) {
                # The information message is shown
                private$dm("Removing low frequency n-grams", md = 1)
                # All n-grams with frequency less than min_freq are ignored
                df <- df[df$freq >= private$tg_opts[["min_freq"]], ]
                # The information message is shown
                private$dm(" \u2714\n", md = 1)
            }
            # The column names are set
            colnames(df) <- c("pre", "freq")
            # The output is set to the updated variable
            private$p_output <- df
            # If the n-gram data should be saved
            if (private$tg_opts[["save_ngrams"]]) {
                # The required file name
                fn <- private$get_file_name()
                # The format
                fo <- private$tg_opts[["format"]]
                # The n-gram data frame is written to file
                private$write_data(private$p_output, fn, fo, F)
            }
            # If n-gram data should not be saved
            else {
                return(private$p_output)
            }
        },

        # @description
        # It generates n-gram frequencies for the given lines of text.
        # @param lines The lines of text to process
        generate_ngrams = function(lines) {
            # The n-gram number
            n <- private$tg_opts[["n"]]
            # If n > 1
            if (n > 1) {
                # Trailing and leading white space is removed
                l <- trimws(lines, "both")
                # Start and end of sentence tags are added
                l <- gsub("(^)(.+)($)", "<s>\\2<e>", l)
                # The lines are split on space
                w <- strsplit(l, " ")
                # The words are converted to an atomic vector
                w <- unlist(w)
                # The index of empty words
                i <- (w == "")
                # The empty words are removed
                w <- w[!i]
                # The indexes for the words
                indexes <- seq(length(w))
                # The n-grams are generated
                l <- sapply(indexes, function(i) {
                    # If the words should be stemmed
                    if (private$tg_opts[["stem_words"]]) {
                        # The n-gram prefix words are stemmed. The next word is
                        # not stemmed
                        v <- c(wordStem(w[i:(i + n - 2)]), w[(i + n - 1)])
                    }
                    else {
                        # The n-gram token
                        v <- w[i:(i + n - 1)]
                    }
                    # The n-gram token
                    v <- paste0(v, collapse = "_")
                    # The n-gram token is returned
                    return(v)
                },
                simplify = T
                )
                # Invalid n-grams need to be removed
                # A logical vector indicating position of invalid n-grams
                i <- grepl(".+<e>.+", l)
                # The list of valid n-grams
                l <- l[!i]
                # The start of sentence tokens are removed
                l <- gsub("<s>", "", l)
                # The end of sentence tokens are removed
                l <- gsub("<e>", "", l)
            }
            else {
                # The line is split on " "
                words <- strsplit(lines, " ")
                # The list of words is converted to atomic vector
                l <- unlist(words)
                # The index of empty words
                i <- l == ""
                # The empty words are removed
                l <- l[!i]
            }

            return(l)
        }
    )
)

Try the wordpredictor package in your browser

Any scripts or data that you put into this service are public.

wordpredictor documentation built on Jan. 4, 2022, 5:07 p.m.