Nothing
#' Provides data cleaning functionality
#'
#' @description
#' It provides a memory efficient method for removing unneeded
#' characters from text files. It is suitable for cleaning large text files.
#'
#' @details
#' It provides a method for cleaning text files. It allows removing bad words,
#' stop words, non dictionary words, extra space, punctuation and non-alphabet
#' characters. It also allows conversion to lower case. It supports large text
#' files.
#'
#' @importFrom stringr str_count boundary
DataCleaner <- R6::R6Class(
  "DataCleaner",
  inherit = Base,
  public = list(
    #' @description
    #' It initializes the current object. It sets the file name, merges the
    #' given data cleaning options with the defaults, reads the stop words,
    #' dictionary and bad words files and sets the verbose option.
    #' @param fn The path to the file to clean.
    #' @param opts The options for data cleaning.
    #'   * **min_words**. The minimum number of words per sentence.
    #'   * **line_count**. The number of lines to read and clean at a time.
    #'   * **save_data**. If the combined processed lines should be saved.
    #'   * **output_file**. Name of the output file used to store the data.
    #'   * **sw_file**. The stop words file path.
    #'   * **dict_file**. The dictionary file path.
    #'   * **bad_file**. The bad words file path.
    #'   * **to_lower**. If the words should be converted to lower case.
    #'   * **remove_stop**. If stop words should be removed.
    #'   * **remove_punct**. If punctuation symbols should be removed.
    #'   * **remove_non_dict**. If non dictionary words should be removed.
    #'   * **remove_non_alpha**. If non alphabet symbols should be removed.
    #'   * **remove_extra_space**. If leading, trailing and double spaces
    #'     should be removed.
    #'   * **remove_bad**. If bad words should be removed
    #' @param ve The level of detail in the information messages.
    #' @export
    initialize = function(fn = NULL, opts = list(), ve = 0) {
      # An object of class EnvManager is created
      em <- EnvManager$new(ve)
      # The stop words file is checked
      opts[["sw_file"]] <- em$get_data_fn(
        opts[["sw_file"]], "stop-words.txt"
      )
      # The bad words file is checked
      opts[["bad_file"]] <- em$get_data_fn(
        opts[["bad_file"]], "bad-words.txt"
      )
      # The dict words file is checked
      opts[["dict_file"]] <- em$get_data_fn(
        opts[["dict_file"]], "dict-no-bad.txt"
      )
      # The given options are merged with the opts attribute
      private$dc_opts <- modifyList(private$dc_opts, opts)
      # The base class is initialized
      super$initialize(fn, private$dc_opts[["line_count"]], ve)
      # The stop words file is read
      private$sw <- private$read_file(private$dc_opts[["sw_file"]], FALSE)
      # The dictionary file is read
      private$dw <- private$read_file(private$dc_opts[["dict_file"]], FALSE)
      # The bad word file is read
      private$bw <- private$read_file(private$dc_opts[["bad_file"]], FALSE)
      # If the output file name is not given, then the default file name
      # is used. The default file name is generated by appending "-clean"
      # to the input file name.
      if (!is.null(fn) && is.null(private$dc_opts[["output_file"]])) {
        # The default file name. Only a literal ".txt" extension at the
        # end of the name is replaced.
        dfn <- sub("\\.txt$", "-clean.txt", fn)
        # If the input file has no ".txt" extension, the suffix is
        # appended so that the output never overwrites the input file
        if (identical(dfn, fn)) {
          dfn <- paste0(fn, "-clean.txt")
        }
        # The default file name is set
        private$dc_opts[["output_file"]] <- dfn
        # The information message
        msg <- paste0(
          "Output file name not given.",
          " Using the default file name: ", dfn, "\n"
        )
        # The information message is shown
        private$dm(msg, md = 1, ty = "w")
      }
      # The save_data option of base class is set
      private$opts[["save_data"]] <- private$dc_opts[["save_data"]]
      # The output_file option of base class is set
      private$opts[["output_file"]] <- private$dc_opts[["output_file"]]
    },
    #' @description
    #' It provides an efficient method for cleaning text files.
    #' It removes unneeded characters from the given text file with several
    #' options.
    #'
    #' It allows removing punctuation, bad words, stop words,
    #' non-alphabetical symbols and non-dictionary words. It reads a certain
    #' number of lines from the given text file. It removes unneeded
    #' characters from the lines and then saves the lines to an output text
    #' file.
    #'
    #' File cleaning progress is displayed if the verbose option was
    #' set in the class constructor. It is suitable for cleaning large text
    #' files.
    #' @return If the save_data option is FALSE, the processed output kept
    #'   by the base class is returned. Otherwise nothing is returned.
    #' @examples
    #' # Start of environment setup code
    #' # The level of detail in the information messages
    #' ve <- 0
    #' # The name of the folder that will contain all the files. It will be
    #' # created in the current directory. NULL implies tempdir will be used
    #' fn <- NULL
    #' # The required files. They are default files that are part of the
    #' # package
    #' rf <- c("test.txt")
    #' # An object of class EnvManager is created
    #' em <- EnvManager$new(ve = ve, rp = "./")
    #' # The required files are downloaded
    #' ed <- em$setup_env(rf, fn)
    #' # End of environment setup code
    #'
    #' # The cleaned test file name
    #' cfn <- paste0(ed, "/test-clean.txt")
    #' # The test file name
    #' fn <- paste0(ed, "/test.txt")
    #' # The data cleaning options
    #' dc_opts <- list("output_file" = cfn)
    #' # The data cleaner object is created
    #' dc <- DataCleaner$new(fn, dc_opts, ve = ve)
    #' # The sample file is cleaned
    #' dc$clean_file()
    #'
    #' # The test environment is removed. Comment the below line, so the
    #' # files generated by the function can be viewed
    #' em$td_env()
    clean_file = function() {
      # The information message is shown
      private$dh("Cleaning file", "-", md = 1)
      # The base class process_file function is called
      private$process_file(
        private$pre_process, private$process,
        private$post_process
      )
      # The information message is shown
      private$dh("DONE", "=", md = 1)
      # If the data should not be saved
      if (!private$dc_opts[["save_data"]]) {
        # The processed output is returned
        return(private$p_output)
      }
    },
    #' @description
    #' It cleans the given lines of text using the options
    #' passed to the current object. The lines are split into sentences on
    #' the "." character, so the result has one element per sentence and
    #' not one element per input line.
    #' @param lines The input sentences.
    #' @return The cleaned lines of text.
    #' @examples
    #' # The level of detail in the information messages
    #' ve <- 0
    #' # Test data is read
    #' l <- c(
    #'   "If you think I'm wrong, send me a link to where it's happened",
    #'   "We're about 90percent done with this room",
    #'   "This isn't how I wanted it between us.",
    #'   "Almost any cute breed can become ornamental",
    #'   "Once upon a time there was a kingdom with a castle",
    #'   "That's not a thing any of us are granted'",
    #'   "Why are you being so difficult? she asks."
    #' )
    #' # The expected results
    #' res <- c(
    #'   "if you think wrong send me a link to where its happened",
    #'   "were about percent done with this room",
    #'   "this how i wanted it between us",
    #'   "almost any cute breed can become ornamental",
    #'   "once upon a time there was a kingdom with a castle",
    #'   "thats not a thing any of us are granted",
    #'   "why are you being so difficult she asks"
    #' )
    #' # The DataCleaner object is created
    #' dc <- DataCleaner$new(ve = ve)
    #' # The line is cleaned
    #' cl <- dc$clean_lines(l)
    #' # The cleaned lines are printed
    #' print(cl)
    clean_lines = function(lines) {
      # The lines to clean
      l <- lines
      # If a line does not end with a ".", then "." is appended to the
      # line
      l <- gsub("(.+[^\\.])$", "\\1.", l)
      # The "." character is replaced with the string "specialdotsep".
      # The placeholder is purely alphabetic, so it survives the removal
      # of punctuation and non alphabet symbols below.
      l <- gsub("\\.", " specialdotsep ", l)
      # If the words should be converted to lower case
      if (private$dc_opts[["to_lower"]]) {
        # The information message
        private$dm("Converting lines to lower case\n", md = 3)
        # The line is converted to lower case
        l <- tolower(l)
      }
      # If punctuation symbols should be removed
      if (private$dc_opts[["remove_punct"]]) {
        # The information message
        private$dm("Removing punctuation symbols\n", md = 3)
        # The pattern for removing all punctuation symbols
        l <- gsub("[[:punct:]\u2026\u2019\u201c\u201d]", "", l)
      }
      # If non alphabet symbols should be removed
      if (private$dc_opts[["remove_non_alpha"]]) {
        # The information message
        private$dm("Removing non alphabet symbols\n", md = 3)
        # Every character that is neither a letter nor white space is
        # removed. The rest of the word is kept.
        l <- gsub("([^[:alpha:]\\s])", "", l, perl = TRUE)
      }
      # If stop words should be removed
      if (private$dc_opts[["remove_stop"]]) {
        # The information message
        private$dm("Removing stop words\n", md = 3)
        # Stop words are collapsed
        sw <- paste(private$sw, collapse = "|")
        swp <- paste0("\\b(", sw, ")\\b")
        # The stop words are removed
        l <- gsub(swp, "", l)
      }
      # The words in the lines are extracted
      words <- strsplit(l, split = " ")
      # The words are converted to an atomic list
      words <- unlist(words)
      # If non dictionary words should be removed
      if (private$dc_opts[["remove_non_dict"]]) {
        # The information message
        private$dm("Removing non dictionary words\n", md = 3)
        # The "specialdotsep" string is added to list of dictionary
        # words, so that the sentence separators are kept
        dw <- c(private$dw, "specialdotsep")
        # The non dictionary words are removed from the data
        words <- words[words %in% dw]
        # All 1 length words except for 'a' and 'i' are removed
        # The indexes position of all words that are "a" or "i"
        i1 <- (words == "a" | words == "i")
        # The index position of words of length 2 or more
        i2 <- (nchar(words) > 1)
        # The list of all words of length 2 or more including "a" and
        # "i"
        words <- words[i1 | i2]
      }
      # If bad words should be removed
      if (private$dc_opts[["remove_bad"]]) {
        # The information message
        private$dm("Removing bad words\n", md = 3)
        # The bad words are removed from the data. The "specialdotsep"
        # separator must not be removed here, otherwise all sentences
        # would be merged into a single one.
        words <- words[!words %in% private$bw]
      }
      # The words are combined with space
      l <- paste(words, collapse = " ")
      # The "specialdotsep" string is replaced with "."
      l <- gsub("specialdotsep", ".", l)
      # The sentences in the lines are extracted
      l <- strsplit(l, split = "\\.")
      # The sentences are converted to an atomic list
      l <- unlist(l)
      # If each sentence should have a minimum number of words
      if (private$dc_opts[["min_words"]] > -1) {
        # The information message
        msg <- paste0(
          "Removing lines that have less than ",
          private$dc_opts[["min_words"]], " words\n"
        )
        # The information message
        private$dm(msg, md = 3)
        # The number of words in each sentence
        wc <- str_count(l, pattern = boundary("word"))
        # The lines containing less than min_words number of words are
        # removed
        l <- l[wc >= private$dc_opts[["min_words"]]]
      }
      # Consecutive 'a' and 'i' are replaced with single 'a' or 'i'.
      # The word boundary ensures that only the stand-alone words "a" and
      # "i" are matched and not the last letter of the previous word.
      l <- gsub("\\b(a\\s){2,}", "\\1 ", l)
      l <- gsub("\\b(i\\s){2,}", "\\1 ", l)
      # A stand-alone "a" at the end of a sentence is removed. Words that
      # merely end with the letter "a" are left unchanged.
      l <- sub("\\ba$", "", l)
      # If extra spaces should be removed
      if (private$dc_opts[["remove_extra_space"]]) {
        # The information message
        private$dm("Removing extra spaces\n", md = 3)
        # Multiple spaces are replaced by single space
        l <- gsub("\\s{2,}", " ", l)
        # Leading and trailing whitespaces are removed
        l <- trimws(l)
      }
      return(l)
    }
  ),
  private = list(
    # @field dc_opts The options for the data cleaner object.
    # * **min_words**. The minimum number of words per sentence.
    # * **line_count**. The number of lines to read and clean at a time.
    # * **save_data**. If the combined processed lines should be saved.
    # * **output_file**. Name of the output file used to store the data.
    # * **sw_file**. The stop words file path.
    # * **dict_file**. The dictionary file path.
    # * **bad_file**. The bad words file path.
    # * **to_lower**. If the words should be converted to lower case.
    # * **remove_stop**. If stop words should be removed.
    # * **remove_punct**. If punctuation symbols should be removed.
    # * **remove_non_dict**. If non dictionary words should be removed.
    # * **remove_non_alpha**. If non alphabet symbols should be removed.
    # * **remove_extra_space**. If leading, trailing and double spaces
    #     should be removed.
    # * **remove_bad**. If bad words should be removed
    dc_opts = list(
      "min_words" = 2,
      "line_count" = 1000,
      "save_data" = TRUE,
      "output_file" = NULL,
      "sw_file" = NULL,
      "dict_file" = NULL,
      "bad_file" = NULL,
      "to_lower" = TRUE,
      "remove_stop" = FALSE,
      "remove_punct" = TRUE,
      "remove_non_dict" = TRUE,
      "remove_non_alpha" = TRUE,
      "remove_extra_space" = TRUE,
      "remove_bad" = FALSE
    ),
    # @field sw The list of stop words.
    sw = list(),
    # @field bw The list of bad words.
    bw = list(),
    # @field dw The list of dictionary words.
    dw = list(),
    # @description
    # Performs processing for the clean_file function.
    # It passes the given lines of text to clean_lines, which divides
    # them into sentences by splitting on '.', cleans each sentence and
    # rejects the sentences that have less than min_words words.
    # @param lines The lines of text to clean.
    # @return The cleaned sentences returned by clean_lines.
    process = function(lines) {
      # The sentence is cleaned
      cl <- self$clean_lines(lines)
      return(cl)
    }
  )
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.