Nothing
#' Generates n-grams from text files
#'
#' @description
#' It generates n-gram tokens along with their frequencies. The data
#' may be saved to a file in plain text format or as a R object.
#'
#' @importFrom SnowballC wordStem
#' @importFrom dplyr group_by summarize_all %>%
TokenGenerator <- R6::R6Class(
"TokenGenerator",
inherit = Base,
public = list(
#' @description
#' It initializes the current obj. It is used to set the file name,
#' tokenization options and verbose option.
#' @param fn The path to the input file.
#' @param opts The options for generating the n-gram tokens.
#' * **n**. The n-gram size.
#' * **save_ngrams**. If the n-gram data should be saved.
#' * **min_freq**. All n-grams with frequency less than min_freq are
#' ignored.
#' * **line_count**. The number of lines to process at a time.
#' * **stem_words**. If words should be transformed to their stems.
#' * **dir**. The dir where the output file should be saved.
#' * **format**. The format for the output. There are two options.
#' * **plain**. The data is stored in plain text.
#' * **obj**. The data is stored as a R obj.
#' @param ve The level of detail in the information messages.
#' @export
initialize = function(fn = NULL, opts = list(), ve = 0) {
# The given options are merged with the opts attribute
private$tg_opts <- modifyList(private$tg_opts, opts)
# The base class is initialized
super$initialize(fn, private$tg_opts$line_count, ve)
# The processed output is initialized
private$p_output <- NULL
},
#' @description
#' It generates n-gram tokens and their frequencies from the
#' given file name. The tokens may be saved to a text file as plain text
#' or a R object.
#' @return The data frame containing n-gram tokens along with their
#' frequencies.
#' @examples
#' # Start of environment setup code
#' # The level of detail in the information messages
#' ve <- 0
#' # The name of the folder that will contain all the files. It will be
#' # created in the current directory. NULL implies tempdir will be used
#' fn <- NULL
#' # The required files. They are default files that are part of the
#' # package
#' rf <- c("test-clean.txt")
#' # An object of class EnvManager is created
#' em <- EnvManager$new(ve = ve, rp = "./")
#' # The required files are downloaded
#' ed <- em$setup_env(rf, fn)
#' # End of environment setup code
#'
#' # The n-gram size
#' n <- 4
#' # The test file name
#' tfn <- paste0(ed, "/test-clean.txt")
#' # The n-gram number is set
#' tg_opts <- list("n" = n, "save_ngrams" = TRUE, "dir" = ed)
#' # The TokenGenerator object is created
#' tg <- TokenGenerator$new(tfn, tg_opts, ve = ve)
#' # The n-gram tokens are generated
#' tg$generate_tokens()
#'
#' # The test environment is removed. Comment the below line, so the
#' # files generated by the function can be viewed
#' em$td_env()
generate_tokens = function() {
# The processed output is initialized
private$p_output <- NULL
# The output file name
fn <- private$get_file_name()
# If the output file already exists
if (file.exists(fn)) {
# The information message is shown
private$dm("The ",
private$tg_opts[["n"]],
"-gram file already exists\n",
md = 1,
ty = "w"
)
# If the n-gram data should not be saved
if (!private$tg_opts[["save_ngrams"]]) {
# The n-grams file is read
private$p_output <- private$read_data(
fn, private$tg_opts[["format"]], T
)
}
}
else {
# The information message
msg <- paste0("Generating ", private$tg_opts[["n"]])
msg <- paste0(msg, "-gram tokens")
# The information message is shown
private$dh(msg, "-", md = 1)
# The base class process_file function is called
private$process_file(
private$pre_process, private$process,
private$post_process
)
# The information message is shown
private$dh("DONE", "=", md = 1)
}
}
),
private = list(
# @field tg_opts The options for the token generator obj.
# * **n**. The n-gram size.
# * **save_ngrams**. If the n-gram data should be saved.
# * **min_freq**. All n-grams with frequency less than min_freq are
# ignored.
# * **stem_words**. If words should be transformed to their stems.
# * **line_count**. The number of lines to process at a time.
# * **dir**. The dir where the output file should be saved.
# * **format**. The format for the output. There are two options.
# * **plain**. The data is stored in plain text.
# * **obj**. The data is stored as a R obj.
tg_opts = list(
"n" = 1,
"save_ngrams" = F,
"stem_words" = F,
"min_freq" = -1,
"line_count" = 5000,
"dir" = NULL,
"format" = "obj"
),
# @description
# Performs processing for the generate_tokens function. It
# processes the given line of text. It converts each line of text into
# n-grams of the given size. The frequency of each n-gram is updated.
# @param lines The lines of text.
process = function(lines) {
# n-grams are extracted from each line
ngrams <- private$generate_ngrams(lines)
# The n-gram words are appended to the processed output
private$p_output <- c(private$p_output, ngrams)
},
# @description
# It returns the name of the output n-gram file.
get_file_name = function() {
# The n-gram number
n <- private$tg_opts[["n"]]
# The format
fo <- private$tg_opts[["format"]]
# The output directory
dir <- private$tg_opts[["dir"]]
# The file extension
if (fo == "plain") {
ext <- ".txt"
} else {
ext <- ".RDS"
}
# The file name
file_name <- paste0(dir, "/n", n, ext)
return(file_name)
},
# @description
# It saves the n-gram tokens and their frequencies to a text file.
post_process = function() {
# The information message
msg <- paste0("Calculating ", private$tg_opts[["n"]])
msg <- paste0(msg, "-gram frequencies")
# The information message is shown
private$dm(msg, md = 1)
# The output is copied to a variable
df <- data.frame("pre" = private$p_output)
# A frequency column is added
df$freq <- 1
# Each prefix is grouped and summed
df <- df %>%
group_by(pre) %>%
summarize_all(sum)
# The information message is shown
private$dm(" \u2714\n", md = 1)
# If the minimum n-gram frequency is given
if (private$tg_opts[["min_freq"]] > -1) {
# The information message is shown
private$dm("Removing low frequency n-grams", md = 1)
# All n-grams with frequency less than min_freq are ignored
df <- df[df$freq >= private$tg_opts[["min_freq"]], ]
# The information message is shown
private$dm(" \u2714\n", md = 1)
}
# The column names are set
colnames(df) <- c("pre", "freq")
# The output is set to the updated variable
private$p_output <- df
# If the n-gram data should be saved
if (private$tg_opts[["save_ngrams"]]) {
# The required file name
fn <- private$get_file_name()
# The format
fo <- private$tg_opts[["format"]]
# The n-gram data frame is written to file
private$write_data(private$p_output, fn, fo, F)
}
# If n-gram data should not be saved
else {
return(private$p_output)
}
},
# @description
# It generates n-gram frequencies for the given lines of text.
# @param lines The lines of text to process
generate_ngrams = function(lines) {
# The n-gram number
n <- private$tg_opts[["n"]]
# If n > 1
if (n > 1) {
# Trailing and leading white space is removed
l <- trimws(lines, "both")
# Start and end of sentence tags are added
l <- gsub("(^)(.+)($)", "<s>\\2<e>", l)
# The lines are split on space
w <- strsplit(l, " ")
# The words are converted to an atomic vector
w <- unlist(w)
# The index of empty words
i <- (w == "")
# The empty words are removed
w <- w[!i]
# The indexes for the words
indexes <- seq(length(w))
# The n-grams are generated
l <- sapply(indexes, function(i) {
# If the words should be stemmed
if (private$tg_opts[["stem_words"]]) {
# The n-gram prefix words are stemmed. The next word is
# not stemmed
v <- c(wordStem(w[i:(i + n - 2)]), w[(i + n - 1)])
}
else {
# The n-gram token
v <- w[i:(i + n - 1)]
}
# The n-gram token
v <- paste0(v, collapse = "_")
# The n-gram token is returned
return(v)
},
simplify = T
)
# Invalid n-grams need to be removed
# A logical vector indicating position of invalid n-grams
i <- grepl(".+<e>.+", l)
# The list of valid n-grams
l <- l[!i]
# The start of sentence tokens are removed
l <- gsub("<s>", "", l)
# The end of sentence tokens are removed
l <- gsub("<e>", "", l)
}
else {
# The line is split on " "
words <- strsplit(lines, " ")
# The list of words is converted to atomic vector
l <- unlist(words)
# The index of empty words
i <- l == ""
# The empty words are removed
l <- l[!i]
}
return(l)
}
)
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.