# Nothing
#' Analyzes input text files and n-gram token files
#'
#' @description
#' It provides a method that returns information about text files, such as
#' the number of lines, the line lengths and the file size. It also provides
#' a method that displays bar plots of n-gram frequencies. Additionally it
#' provides a method for searching for n-grams in an n-gram token file. This
#' file is generated using the TokenGenerator class.
#'
#' @details
#' It provides a method that returns text file information. The text
#' file information includes total number of lines, max, min and mean line
#' length and file size.
#'
#' It also provides a method that generates a bar plot showing the most common
#' n-gram tokens.
#'
#' Another method is provided which returns a list of n-grams that match the
#' given regular expression.
#' @importFrom ggplot2 ggplot geom_bar ggtitle coord_flip ylab xlab aes ggsave
DataAnalyzer <- R6::R6Class(
  "DataAnalyzer",
  inherit = Base,
  public = list(
    #' @description
    #' It initializes the current object. It is used to set the file name
    #' and verbose options.
    #' @param fn The path to the input file.
    #' @param ve The level of detail in the information messages.
    #' @export
    initialize = function(fn = NULL, ve = 0) {
      # The file name is set
      private$fn <- fn
      # The processed output is initialized
      private$p_output <- data.frame()
      # The verbose options is set
      private$ve <- ve
    },
    #' @description
    #' It allows generating two type of n-gram plots. It first reads n-gram
    #' token frequencies from an input text file. The n-gram frequencies are
    #' displayed in a bar plot.
    #'
    #' The type of plot is specified by the type option. The type options
    #' can have the values 'top_features' or 'coverage'. 'top_features'
    #' displays the top n most occurring tokens along with their
    #' frequencies. 'coverage' displays the number of words along with their
    #' frequencies.
    #'
    #' The plot stats are returned as a data frame.
    #' @param opts The options for analyzing the data.
    #' * **type**. The type of plot to display. The options are:
    #'     'top_features', 'coverage'. Defaults to 'top_features' if omitted.
    #' * **n**. For 'top_features', it is the number of top most occurring
    #'     tokens. For 'coverage' it is the first n frequencies. Defaults to
    #'     10 if omitted.
    #' * **save_to**. The graphics devices to save the plot to.
    #'     NULL implies plot is printed.
    #' * **dir**. The output directory where the plot will be saved.
    #' @return A data frame containing the stats.
    #' @examples
    #' # Start of environment setup code
    #' # The level of detail in the information messages
    #' ve <- 0
    #' # The name of the folder that will contain all the files. It will be
    #' # created in the current directory. NULL value implies tempdir will
    #' # be used.
    #' fn <- NULL
    #' # The required files. They are default files that are part of the
    #' # package
    #' rf <- c("n2.RDS")
    #' # An object of class EnvManager is created
    #' em <- EnvManager$new(ve = ve, rp = "./")
    #' # The required files are downloaded
    #' ed <- em$setup_env(rf, fn)
    #' # End of environment setup code
    #'
    #' # The n-gram file name
    #' nfn <- paste0(ed, "/n2.RDS")
    #' # The DataAnalyzer object is created
    #' da <- DataAnalyzer$new(nfn, ve = ve)
    #' # The top features plot is checked
    #' df <- da$plot_n_gram_stats(opts = list(
    #'     "type" = "top_features",
    #'     "n" = 10,
    #'     "save_to" = NULL,
    #'     "dir" = ed
    #' ))
    #' # N-gram statistics are displayed
    #' print(df)
    #' # The test environment is removed. Comment the below line, so the
    #' # files generated by the function can be viewed
    #' em$td_env()
    plot_n_gram_stats = function(opts) {
      # Options that were not supplied fall back to the class defaults
      opts <- utils::modifyList(private$da_opts, as.list(opts))
      type <- opts[["type"]]
      n <- opts[["n"]]
      # An unsupported plot type is rejected here. Otherwise the plot
      # labels would stay undefined and the failure would be obscure.
      if (!is.character(type) || length(type) != 1 ||
        !(type %in% c("coverage", "top_features"))) {
        stop(
          "Invalid plot type. It must be 'top_features' or 'coverage'.",
          call. = FALSE
        )
      }
      # The number of items must be a single positive number
      if (!is.numeric(n) || length(n) != 1 || is.na(n) || n < 1) {
        stop("The 'n' option must be a positive number.", call. = FALSE)
      }
      # The information message is shown
      private$dh("Displaying Plot", "-", md = 1)
      # The n-gram data is read
      df <- private$read_obj(private$fn)
      # If the coverage option was specified
      if (type == "coverage") {
        # The total number of n-grams
        total <- nrow(df)
        # The percentage of n-grams with frequency 1, 2, ..., n, rounded
        # to 2 decimal places
        pct <- vapply(
          seq_len(n),
          function(i) {
            round(100 * sum(df$freq == i, na.rm = TRUE) / total, 2)
          },
          numeric(1)
        )
        # The plotted data: "pre" holds the frequency value as a label
        # and "freq" holds the percentage of n-grams with that frequency
        df <- data.frame("freq" = pct, "pre" = as.character(seq_len(n)))
        # The plot labels
        labels <- list(
          y = "Percentage of total",
          x = "Word Frequency",
          title = "Coverage"
        )
      } else {
        # The top_features option was specified. The plot labels are set
        labels <- list(
          y = "Frequency",
          x = "Feature",
          title = paste("Top", n, "Features")
        )
      }
      # The freq column is converted to numeric
      df$freq <- as.numeric(df$freq)
      # The pre column is converted to character
      df$pre <- as.character(df$pre)
      # The data frame is sorted in descending order
      df <- df[order(df$freq, decreasing = TRUE), ]
      # At most n rows are kept. head() does not pad with NA rows when
      # fewer than n rows are available.
      df <- utils::head(df, n)
      # The chart is plotted
      g <- private$display_plot(df, labels)
      # If the save_to and dir options are not NULL
      if (!is.null(opts[["save_to"]]) && !is.null(opts[["dir"]])) {
        # The file name for the plot
        fn <- paste0(type, ".", opts[["save_to"]])
        # The plot object is saved
        ggsave(
          filename = fn,
          plot = g,
          device = opts[["save_to"]],
          path = opts[["dir"]],
          width = 7,
          height = 7,
          units = "in"
        )
      } else {
        # The plot is printed
        print(g)
      }
      # The information message is shown
      private$dh("DONE", "=", md = 1)
      # The plotted stats are returned
      df
    },
    #' @description
    #' It generates information about text files. It takes as input a file
    #' or a directory containing text files. For each file it calculates the
    #' total number of lines, maximum, minimum and mean line lengths and the
    #' total file size. The overall stats contain the total line count, the
    #' longest and shortest line over all files, the mean line length over
    #' all lines and the total size.
    #' @param res The name of a directory or a file name.
    #' @return A list with two data frames. "file_stats" contains one row
    #'   per file and "overall_stats" contains the combined statistics.
    #' @examples
    #' # Start of environment setup code
    #' # The level of detail in the information messages
    #' ve <- 0
    #' # The name of the folder that will contain all the files. It will be
    #' # created in the current directory. NULL implies tempdir will be used
    #' fn <- NULL
    #' # The required files. They are default files that are part of the
    #' # package
    #' rf <- c("test.txt")
    #' # An object of class EnvManager is created
    #' em <- EnvManager$new(ve = ve, rp = "./")
    #' # The required files are downloaded
    #' ed <- em$setup_env(rf, fn)
    #' # End of environment setup code
    #'
    #' # The test file name
    #' cfn <- paste0(ed, "/test.txt")
    #' # The DataAnalyzer object is created
    #' da <- DataAnalyzer$new(ve = ve)
    #' # The file info is fetched
    #' fi <- da$get_file_info(cfn)
    #' # The file information is printed
    #' print(fi)
    #'
    #' # The test environment is removed. Comment the below line, so the
    #' # files generated by the function can be viewed
    #' em$td_env()
    get_file_info = function(res) {
      # The information message is shown
      private$dh("Generating file stats", "-", md = 1)
      # Formats a byte count as a human readable size using the public
      # format() method for object_size values
      fmt_size <- function(bytes) {
        format(structure(bytes, class = "object_size"), units = "auto")
      }
      # The list of files to check
      fl <- NULL
      if (dir.exists(res)) {
        # A directory name was passed. All files with the .txt extension
        # are fetched. The pattern is a regular expression, not a glob.
        fl <- dir(res, full.names = TRUE, pattern = "\\.txt$")
      } else if (file.exists(res)) {
        # A file name was passed
        fl <- res
      }
      # Used to store information about each file
      per_file <- vector("list", length(fl))
      # The running totals over all files
      total_lc <- 0
      total_s <- 0
      total_chars <- 0
      # The overall longest and shortest line lengths
      max_ll <- 0
      min_ll <- 0
      # For each file in the list
      for (i in seq_along(fl)) {
        fn <- fl[[i]]
        # The file is read
        lines <- private$read_file(fn, FALSE)
        # The length of each line
        ll <- nchar(lines)
        # The line count
        lc <- length(lines)
        # The file size
        size <- file.size(fn)
        if (lc > 0) {
          # The max, min and mean line length for the file
          f_max <- max(ll)
          f_min <- min(ll)
          f_mean <- round(mean(ll))
          if (total_lc == 0) {
            # First file with content: it seeds the overall extremes
            max_ll <- f_max
            min_ll <- f_min
          } else {
            max_ll <- max(max_ll, f_max)
            min_ll <- min(min_ll, f_min)
          }
        } else {
          # An empty file has no line lengths. Zeros are reported instead
          # of the -Inf/Inf/NaN that max/min/mean give for empty input.
          f_max <- f_min <- f_mean <- 0
        }
        # The running totals are updated
        total_s <- total_s + size
        total_lc <- total_lc + lc
        total_chars <- total_chars + sum(ll)
        # The file stats, with the size formatted for display
        per_file[[i]] <- data.frame(
          "fn" = fn,
          "total_lc" = lc,
          "max_ll" = f_max,
          "min_ll" = f_min,
          "mean_ll" = f_mean,
          "size" = fmt_size(size),
          stringsAsFactors = FALSE
        )
      }
      # The per file stats are combined into a single data frame
      fstats <- if (length(per_file) > 0) {
        do.call(rbind, per_file)
      } else {
        data.frame()
      }
      # The overall stats. The mean is taken over all lines of all files.
      ostats <- data.frame(
        "total_lc" = total_lc,
        "max_ll" = max_ll,
        "min_ll" = min_ll,
        "mean_ll" = if (total_lc > 0) round(total_chars / total_lc) else 0,
        "total_s" = fmt_size(total_s),
        stringsAsFactors = FALSE
      )
      # The required stats
      stats <- list("file_stats" = fstats, "overall_stats" = ostats)
      # The information message is shown
      private$dh("DONE", "=", md = 1)
      # The required stats are returned
      stats
    },
    #' @description
    #' It extracts n-grams and their frequencies from a n-gram token file.
    #'
    #' The prefix parameter specifies the regular expression for matching
    #' n-grams. If it is given, all matching n-grams are returned. If this
    #' parameter is not specified then the given number of n-grams are
    #' randomly chosen.
    #' @param fn The n-gram file name.
    #' @param c The number of n-grams to return when no prefix is given. If
    #'   it is NULL or larger than the number of available n-grams, then all
    #'   n-grams are returned in random order.
    #' @param pre The n-gram prefix, given as a regular expression.
    #' @examples
    #' # Start of environment setup code
    #' # The level of detail in the information messages
    #' ve <- 0
    #' # The name of the folder that will contain all the files. It will be
    #' # created in the current directory. NULL implies tempdir will be used
    #' fn <- NULL
    #' # The required files. They are default files that are part of the
    #' # package
    #' rf <- c("n2.RDS")
    #' # An object of class EnvManager is created
    #' em <- EnvManager$new(ve = ve, rp = "./")
    #' # The required files are downloaded
    #' ed <- em$setup_env(rf, fn)
    #' # End of environment setup code
    #'
    #' # The n-gram file name
    #' nfn <- paste0(ed, "/n2.RDS")
    #' # The DataAnalyzer object is created
    #' da <- DataAnalyzer$new(nfn, ve = ve)
    #' # Bi-grams starting with "and_" are returned
    #' df <- da$get_ngrams(fn = nfn, c = 10, pre = "^and_*")
    #' # The data frame is sorted by frequency
    #' df <- df[order(df$freq, decreasing = TRUE),]
    #' # The data frame is printed
    #' print(df)
    #'
    #' # The test environment is removed. Comment the below line, so the
    #' # files generated by the function can be viewed
    #' em$td_env()
    get_ngrams = function(fn, c = NULL, pre = NULL) {
      # The data is read
      df <- private$read_obj(fn)
      # If the prefix is not given
      if (is.null(pre)) {
        # The number of available n-grams
        total <- nrow(df)
        # The sample size is capped at the number of available n-grams,
        # since sampling without replacement cannot return more
        size <- if (is.null(c)) total else min(c, total)
        # The sample indexes
        i <- sample(seq_len(total), size)
        # The n-gram samples
        s <- df[i, ]
      } else {
        # The n-grams that match the prefix
        s <- df[grepl(pre, df$pre), ]
      }
      s
    }
  ),
  private = list(
    # @field da_opts The default options for the data analyzer object.
    #   They are used for options missing from the opts parameter of
    #   plot_n_gram_stats.
    # * **type**. The type of plot to display. The options are:
    #     'top_features', 'coverage'.
    # * **n**. For 'top_features', it is the number of top most occurring
    #     tokens.
    da_opts = list(
      "type" = "top_features",
      "n" = 10
    ),
    # @description
    # Builds a plot using ggplot2. The plot is a horizontal
    # bar plot filled with red. It has the given labels and main title.
    # The plot is not printed here. Printing or saving is left to the
    # caller.
    # @param df The data to plot. It is a data frame with pre and freq
    #   columns.
    # @param labels The main title, x and y axis labels.
    # @return The ggplot object is returned.
    display_plot = function(df, labels) {
      # The n-gram names are ordered by frequency and plotted as bars
      g <- ggplot(data = df, aes(x = stats::reorder(pre, freq), y = freq)) +
        geom_bar(stat = "identity", fill = "red") +
        ggtitle(labels[["title"]]) +
        coord_flip() +
        ylab(labels[["y"]]) +
        xlab(labels[["x"]])
      g
    }
  )
)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.