# R/data-cleaner.R

#' Provides data cleaning functionality
#'
#' @description
#' It provides a memory efficient method for removing unneeded
#' characters from text files. It is suitable for cleaning large text files.
#'
#' @details
#' It provides a method for cleaning text files. It allows removing bad words,
#' stop words, non dictionary words, extra space, punctuation and non-alphabet
#' characters. It also allows conversion to lower case. It supports large text
#' files.
#'
#' @importFrom stringr str_count boundary
DataCleaner <- R6::R6Class(
    "DataCleaner",
    inherit = Base,
    public = list(
        #' @description
        #' It initializes the current object. It is used to set the file name
        #' and verbose options.
        #' @param fn The path to the file to clean.
        #' @param opts The options for data cleaning.
        #' * **min_words**. The minimum number of words per sentence.
        #' * **line_count**. The number of lines to read and clean at a time.
        #' * **save_data**. If the combined processed lines should be saved.
        #' * **output_file**. Name of the output file used to store the data.
        #' * **sw_file**. The stop words file path.
        #' * **dict_file**. The dictionary file path.
        #' * **bad_file**. The bad words file path.
        #' * **to_lower**. If the words should be converted to lower case.
        #' * **remove_stop**. If stop words should be removed.
        #' * **remove_punct**. If punctuation symbols should be removed.
        #' * **remove_non_dict**. If non dictionary words should be removed.
        #' * **remove_non_alpha**. If non alphabet symbols should be removed.
        #' * **remove_extra_space**. If leading, trailing and double spaces
        #'     should be removed.
        #' * **remove_bad**. If bad words should be removed
        #' @param ve The level of detail in the information messages.
        #' @export
        initialize = function(fn = NULL, opts = list(), ve = 0) {

            # An object of class EnvManager is created
            em <- EnvManager$new(ve)
            # The word list paths are resolved through EnvManager$get_data_fn.
            # NOTE(review): presumably this falls back to the named default
            # file when no path was given - confirm in EnvManager.
            # The stop words file is checked
            opts[["sw_file"]] <- em$get_data_fn(
                opts[["sw_file"]], "stop-words.txt"
            )
            # The bad words file is checked
            opts[["bad_file"]] <- em$get_data_fn(
                opts[["bad_file"]], "bad-words.txt"
            )
            # The dict words file is checked
            opts[["dict_file"]] <- em$get_data_fn(
                opts[["dict_file"]], "dict-no-bad.txt"
            )

            # The given options are merged with the opts attribute. Values in
            # opts take precedence over the defaults in dc_opts.
            private$dc_opts <- modifyList(private$dc_opts, opts)

            # The base class is initialized
            super$initialize(fn, private$dc_opts[["line_count"]], ve)
            # The stop words file is read
            private$sw <- private$read_file(
                private$dc_opts[["sw_file"]], FALSE
            )
            # The dictionary file is read
            private$dw <- private$read_file(
                private$dc_opts[["dict_file"]], FALSE
            )
            # The bad word file is read
            private$bw <- private$read_file(
                private$dc_opts[["bad_file"]], FALSE
            )
            # If the output file name is not given, then the default file name
            # is used. The default file name is generated by appending "-clean"
            # to the input file name.
            if (!is.null(fn) && is.null(private$dc_opts[["output_file"]])) {
                # Only a literal ".txt" extension at the end of the name is
                # rewritten. The pattern is escaped and anchored so that other
                # parts of the path are never modified.
                if (grepl("\\.txt$", fn)) {
                    dfn <- sub("\\.txt$", "-clean.txt", fn)
                } else {
                    # Without a ".txt" extension the suffix is appended, so
                    # the output name never equals the input name.
                    dfn <- paste0(fn, "-clean.txt")
                }
                # The default file name is set
                private$dc_opts[["output_file"]] <- dfn
                # The information message
                msg <- paste0("Output file name not given.")
                msg <- paste0(msg, " Using the default file name: ", dfn, "\n")
                # The information message is shown
                private$dm(msg, md = 1, ty = "w")
            }
            # The save_data option of base class is set
            private$opts[["save_data"]] <- private$dc_opts[["save_data"]]
            # The output_file option of base class is set
            private$opts[["output_file"]] <- private$dc_opts[["output_file"]]
        },

        #' @description
        #' It provides an efficient method for cleaning text files.
        #' It removes unneeded characters from the given text file with several
        #' options.
        #'
        #' It allows removing punctuation, bad words, stop words,
        #' non-alphabetical symbols and non-dictionary words. It reads a certain
        #' number of lines from the given text file. It removes unneeded
        #' characters from the lines and then saves the lines to an output text
        #' file.
        #'
        #' File cleaning progress is displayed if the verbose option was
        #' set in the class constructor. It is suitable for cleaning large text
        #' files.
        #' @return The processed output if the save_data option is FALSE.
        #'   Otherwise nothing is returned.
        #' @examples
        #' # Start of environment setup code
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # The name of the folder that will contain all the files. It will be
        #' # created in the current directory. NULL implies tempdir will be used
        #' fn <- NULL
        #' # The required files. They are default files that are part of the
        #' # package
        #' rf <- c("test.txt")
        #' # An object of class EnvManager is created
        #' em <- EnvManager$new(ve = ve, rp = "./")
        #' # The required files are downloaded
        #' ed <- em$setup_env(rf, fn)
        #' # End of environment setup code
        #'
        #' # The cleaned test file name
        #' cfn <- paste0(ed, "/test-clean.txt")
        #' # The test file name
        #' fn <- paste0(ed, "/test.txt")
        #' # The data cleaning options
        #' dc_opts <- list("output_file" = cfn)
        #' # The data cleaner object is created
        #' dc <- DataCleaner$new(fn, dc_opts, ve = ve)
        #' # The sample file is cleaned
        #' dc$clean_file()
        #'
        #' # The test environment is removed. Comment the below line, so the
        #' # files generated by the function can be viewed
        #' em$td_env()
        clean_file = function() {
            # The information message is shown
            private$dh("Cleaning file", "-", md = 1)
            # The base class process_file function is called. It is given the
            # pre-processing, processing and post-processing callbacks.
            private$process_file(
                private$pre_process, private$process,
                private$post_process
            )
            # The information message is shown
            private$dh("DONE", "=", md = 1)
            # If the data should not be saved
            if (!private$dc_opts[["save_data"]]) {
                # The processed output is returned
                return(private$p_output)
            }
        },

        #' @description
        #' It cleans the given lines of text using the options
        #' passed to the current object.
        #'
        #' The words of all input lines are pooled and the text is then split
        #' again into sentences on ".". The number of returned elements can
        #' therefore differ from the number of input lines.
        #' @param lines The input sentences.
        #' @return The cleaned lines of text.
        #' @examples
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # Test data is read
        #' l <- c(
        #'     "If you think I'm wrong, send me a link to where it's happened",
        #'     "We're about 90percent done with this room",
        #'     "This isn't how I wanted it between us.",
        #'     "Almost any cute breed can become ornamental",
        #'     "Once upon a time there was a kingdom with a castle",
        #'     "That's not a thing any of us are granted'",
        #'     "Why are you being so difficult? she asks."
        #' )
        #' # The expected results
        #' res <- c(
        #'     "if you think wrong send me a link to where its happened",
        #'     "were about percent done with this room",
        #'     "this how i wanted it between us",
        #'     "almost any cute breed can become ornamental",
        #'     "once upon a time there was a kingdom with a castle",
        #'     "thats not a thing any of us are granted",
        #'     "why are you being so difficult she asks"
        #' )
        #' # The DataCleaner object is created
        #' dc <- DataCleaner$new(ve = ve)
        #' # The line is cleaned
        #' cl <- dc$clean_lines(l)
        #' # The cleaned lines are printed
        #' print(cl)
        clean_lines = function(lines) {
            # The lines to clean
            l <- lines
            # If a line does not end with a ".", then "." is appended to the
            # line. The pattern needs at least two characters, so empty and
            # single character lines are left unchanged.
            l <- gsub("(.+[^\\.])$", "\\1.", l)
            # The "." character is replaced with the string "specialdotsep".
            # The placeholder protects sentence boundaries from the
            # punctuation and non alphabet removal steps below.
            l <- gsub("\\.", " specialdotsep ", l)
            # If the words should be converted to lower case
            if (private$dc_opts[["to_lower"]]) {
                # The information message
                private$dm("Converting lines to lower case\n", md = 3)
                # The line is converted to lower case
                l <- tolower(l)
            }
            # If punctuation symbols should be removed
            if (private$dc_opts[["remove_punct"]]) {
                # The information message
                private$dm("Removing punctuation symbols\n", md = 3)
                # All punctuation symbols are removed. This includes the
                # ellipsis and the curly quote characters.
                l <- gsub("[[:punct:]\u2026\u2019\u201c\u201d]", "", l)
            }
            # If non alphabet symbols should be removed
            if (private$dc_opts[["remove_non_alpha"]]) {
                # The information message
                private$dm("Removing non alphabet symbols\n", md = 3)
                # Characters that are neither alphabetical nor white space are
                # removed. The rest of the word is kept.
                l <- gsub("([^[:alpha:]\\s])", "", l, perl = TRUE)
            }

            # If stop words should be removed
            if (private$dc_opts[["remove_stop"]]) {
                # The information message
                private$dm("Removing stop words\n", md = 3)
                # Stop words are collapsed into a single alternation that
                # only matches whole words
                sw <- paste(private$sw, collapse = "|")
                swp <- paste0("\\b(", sw, ")\\b")
                # The stop words are removed
                l <- gsub(swp, "", l)
            }

            # The words in the lines are extracted
            words <- strsplit(l, split = " ")
            # The words of all lines are pooled into one character vector
            words <- unlist(words)
            # If non dictionary words should be removed
            if (private$dc_opts[["remove_non_dict"]]) {
                # The information message
                private$dm("Removing non dictionary words\n", md = 3)
                # The "specialdotsep" string is added to list of dictionary
                # words so that the sentence separators are kept
                dw <- c(private$dw, "specialdotsep")
                # The non dictionary words are removed from the data
                words <- words[words %in% dw]
                # All 1 length words except for 'a' and 'i' are removed
                # The indexes position of all words that are "a" or "i"
                i1 <- (words == "a" | words == "i")
                # The index position of words of length 2 or more
                i2 <- (nchar(words) > 1)
                # The list of all words of length 2 or more including "a" and
                # "i"
                words <- words[i1 | i2]
            }
            # If bad words should be removed
            if (private$dc_opts[["remove_bad"]]) {
                # The information message
                private$dm("Removing bad words\n", md = 3)
                # Only the bad words are removed. The "specialdotsep"
                # separator must be kept, otherwise all sentences would be
                # merged into a single line.
                words <- words[!words %in% private$bw]
            }
            # The words are combined with space
            l <- paste(words, collapse = " ")
            # The "specialdotsep" string is replaced with "."
            l <- gsub("specialdotsep", ".", l)
            # The sentences in the lines are extracted
            l <- strsplit(l, split = "\\.")
            # The sentences are converted to a character vector
            l <- unlist(l)
            # If each sentence should have a minimum number of words
            if (private$dc_opts[["min_words"]] > -1) {
                # The information message
                msg <- paste0("Removing lines that have less than ")
                msg <- paste0(msg, private$dc_opts[["min_words"]], " words\n")
                # The information message
                private$dm(msg, md = 3)
                # The number of words in each sentence
                wc <- str_count(l, pattern = boundary("word"))
                # The lines containing less than min_words number of words are
                # removed
                l <- l[wc >= private$dc_opts[["min_words"]]]
            }

            # Consecutive 'a' and 'i' are replaced with single 'a' or 'i'.
            # The word boundary makes sure that only the stand alone words
            # are matched and not the last letter of a longer word.
            l <- gsub("\\b(a\\s){2,}", "\\1 ", l)
            l <- gsub("\\b(i\\s){2,}", "\\1 ", l)
            # A stand alone "a" at the end of a line is removed
            l <- gsub("\\ba$", "", l)
            # If extra spaces should be removed
            if (private$dc_opts[["remove_extra_space"]]) {
                # The information message
                private$dm("Removing extra spaces\n", md = 3)
                # Multiple spaces are replaced by single space
                l <- gsub("\\s{2,}", " ", l)
                # Leading and trailing whitespaces are removed
                l <- trimws(l)
            }

            return(l)
        }
    ),
    private = list(
        # @field dc_opts The options for the data cleaner object.
        # * **min_words**. The minimum number of words per sentence.
        # * **line_count**. The number of lines to read and clean at a time.
        # * **save_data**. If the combined processed lines should be saved.
        # * **output_file**. Name of the output file used to store the data.
        # * **sw_file**. The stop words file path.
        # * **dict_file**. The dictionary file path.
        # * **bad_file**. The bad words file path.
        # * **to_lower**. If the words should be converted to lower case.
        # * **remove_stop**. If stop words should be removed.
        # * **remove_punct**. If punctuation symbols should be removed.
        # * **remove_non_dict**. If non dictionary words should be removed.
        # * **remove_non_alpha**. If non alphabet symbols should be removed.
        # * **remove_extra_space**. If leading, trailing and double spaces
        #     should be removed.
        # * **remove_bad**. If bad words should be removed
        dc_opts = list(
            "min_words" = 2,
            "line_count" = 1000,
            "save_data" = TRUE,
            "output_file" = NULL,
            "sw_file" = NULL,
            "dict_file" = NULL,
            "bad_file" = NULL,
            "to_lower" = TRUE,
            "remove_stop" = FALSE,
            "remove_punct" = TRUE,
            "remove_non_dict" = TRUE,
            "remove_non_alpha" = TRUE,
            "remove_extra_space" = TRUE,
            "remove_bad" = FALSE
        ),

        # @field sw The list of stop words.
        sw = list(),

        # @field bw The list of bad words.
        bw = list(),

        # @field dw The list of dictionary words.
        dw = list(),

        # @description
        # Performs processing for the clean_file function.
        # It passes the given lines of text to clean_lines, which splits the
        # text into sentences on '.', cleans them and rejects the sentences
        # that have less than min_words words.
        # @param lines The lines of text to clean.
        # @return The cleaned lines of text.
        process = function(lines) {
            # The lines are cleaned
            cl <- self$clean_lines(lines)

            return(cl)
        }
    )
)

# Try the wordpredictor package in your browser
#
# Any scripts or data that you put into this service are public.
#
# wordpredictor documentation built on Jan. 4, 2022, 5:07 p.m.