R/data-analyzer.R

#' Analyzes input text files and n-gram token files
#'
#' @description
#' It provides a method that returns information about text files, such as
#' the number of lines, the line lengths and the file size. It also provides a
#' method that displays bar plots of n-gram frequencies. Additionally, it
#' provides a method for searching for n-grams in an n-gram token file. This
#' file is generated using the TokenGenerator class.
#'
#' @details
#' It provides a method that returns text file information. The text
#' file information includes the total number of lines, the maximum, minimum
#' and mean line lengths and the file size.
#'
#' It also provides a method that generates a bar plot showing the most common
#' n-gram tokens.
#'
#' Another method is provided which returns a list of n-grams that match the
#' given regular expression.
#' @importFrom ggplot2 ggplot geom_bar ggtitle coord_flip ylab xlab aes ggsave
DataAnalyzer <- R6::R6Class(
    "DataAnalyzer",
    inherit = Base,
    public = list(
        #' @description
        #' It initializes the current object. It is used to set the file name
        #' and the verbose option.
        #' @param fn The path to the input file.
        #' @param ve The level of detail in the information messages.
        #' @export
        initialize = function(fn = NULL, ve = 0) {
            # The file name is set
            private$fn <- fn
            # The processed output is initialized
            private$p_output <- data.frame()
            # The verbose option is set
            private$ve <- ve
        },

        #' @description
        #' It allows generating two types of n-gram plots. It first reads
        #' n-gram token frequencies from the input n-gram token file. The
        #' n-gram frequencies are displayed in a bar plot.
        #'
        #' The type of plot is specified by the type option. The type option
        #' can have the values 'top_features' or 'coverage'. 'top_features'
        #' displays the top n most frequently occurring tokens along with their
        #' frequencies. 'coverage' displays, for each frequency from 1 to n,
        #' the percentage of tokens that occur with that frequency.
        #'
        #' The plot stats are returned as a data frame.
        #' @param opts The options for analyzing the data.
        #' * **type**. The type of plot to display. The options are:
        #'   'top_features', 'coverage'.
        #' * **n**. For 'top_features', it is the number of top most occurring
        #'   tokens. For 'coverage', it is the number of frequency values to
        #'   consider, starting from 1.
        #' * **save_to**. The graphics device to save the plot to.
        #'   NULL implies the plot is printed.
        #' * **dir**. The output directory where the plot will be saved.
        #' @return A data frame containing the stats.
        #' @examples
        #' # Start of environment setup code
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # The name of the folder that will contain all the files. It will be
        #' # created in the current directory. NULL value implies tempdir will
        #' # be used.
        #' fn <- NULL
        #' # The required files. They are default files that are part of the
        #' # package
        #' rf <- c("n2.RDS")
        #' # An object of class EnvManager is created
        #' em <- EnvManager$new(ve = ve, rp = "./")
        #' # The required files are downloaded
        #' ed <- em$setup_env(rf, fn)
        #' # End of environment setup code
        #'
        #' # The n-gram file name
        #' nfn <- paste0(ed, "/n2.RDS")
        #' # The DataAnalyzer object is created
        #' da <- DataAnalyzer$new(nfn, ve = ve)
        #' # The top features plot is checked
        #' df <- da$plot_n_gram_stats(opts = list(
        #'     "type" = "top_features",
        #'     "n" = 10,
        #'     "save_to" = NULL,
        #'     "dir" = ed
        #' ))
        #' # N-gram statistics are displayed
        #' print(df)
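        #'
        #' # A further illustrative call (a sketch, assuming the same setup as
        #' # above): the coverage plot is generated by setting type to
        #' # 'coverage'
        #' df <- da$plot_n_gram_stats(opts = list(
        #'     "type" = "coverage",
        #'     "n" = 10,
        #'     "save_to" = NULL,
        #'     "dir" = ed
        #' ))
        #' # The coverage statistics are displayed
        #' print(df)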
        #' # The test environment is removed. Comment the below line, so the
        #' # files generated by the function can be viewed
        #' em$td_env()
        plot_n_gram_stats = function(opts) {
            # The information message is shown
            private$dh("Displaying Plot", "-", md = 1)
            # The n-gram data is read
            df <- private$read_obj(private$fn)
            # If the coverage option was specified
            if (opts[["type"]] == "coverage") {
                # The frequency labels, from 1 to n
                y <- as.character(1:opts[["n"]])
                # The percentage of tokens for each frequency
                x <- numeric()
                # The percentage frequencies are calculated
                for (i in 1:opts[["n"]]) {
                    # The percentage of tokens with frequency i
                    x[i] <- 100 * (nrow(df[df$freq == i, ]) / nrow(df))
                    # The percentage is rounded to 2 decimal places
                    x[i] <- round(x[i], 2)
                }
                # A data frame is created
                df <- data.frame("freq" = x, "pre" = y)
                # The plot labels
                labels <- list(
                    y = "Percentage of total",
                    x = "Word Frequency",
                    title = "Coverage"
                )
            }
            # If the top_features option was specified
            else if (opts[["type"]] == "top_features") {
                # The plot labels
                labels <- list(
                    y = "Frequency",
                    x = "Feature",
                    title = paste("Top", opts[["n"]], "Features")
                )
            }
            # The freq column is converted to numeric
            df$freq <- as.numeric(df$freq)
            # The pre column is converted to character
            df$pre <- as.character(df$pre)
            # The data frame is sorted in descending order
            df <- df[order(df$freq, decreasing = TRUE), ]
            # The top n terms are extracted
            df <- df[1:opts[["n"]], ]
            # The chart is plotted
            g <- private$display_plot(df, labels)

            # If the save_to and dir options are not NULL
            if (!is.null(opts[["save_to"]]) && !is.null(opts[["dir"]])) {
                # The file name for the plot
                fn <- paste0(opts[["type"]], ".", opts[["save_to"]])
                # The plot object is saved
                ggsave(
                    filename = fn,
                    plot = g,
                    device = opts[["save_to"]],
                    path = opts[["dir"]],
                    width = 7,
                    height = 7,
                    units = "in"
                )
            }
            else {
                # The plot is printed
                print(g)
            }
            # The information message is shown
            private$dh("DONE", "=", md = 1)

            return(df)
        },

        #' @description
        #' It generates information about text files. It takes as input a file
        #' or a directory containing text files. For each file it calculates the
        #' total number of lines, maximum, minimum and mean line lengths and the
        #' total file size. The file information is returned as a list
        #' containing per-file and overall statistics.
        #' @param res The name of a directory or a file name.
        #' @return A list containing the file statistics and the overall
        #'   statistics.
        #' @examples
        #' # Start of environment setup code
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # The name of the folder that will contain all the files. It will be
        #' # created in the current directory. NULL implies tempdir will be used
        #' fn <- NULL
        #' # The required files. They are default files that are part of the
        #' # package
        #' rf <- c("test.txt")
        #' # An object of class EnvManager is created
        #' em <- EnvManager$new(ve = ve, rp = "./")
        #' # The required files are downloaded
        #' ed <- em$setup_env(rf, fn)
        #' # End of environment setup code
        #'
        #' # The test file name
        #' cfn <- paste0(ed, "/test.txt")
        #' # The DataAnalyzer object is created
        #' da <- DataAnalyzer$new(ve = ve)
        #' # The file info is fetched
        #' fi <- da$get_file_info(cfn)
        #' # The file information is printed
        #' print(fi)
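        #'
        #' # An illustrative variation (a sketch, assuming the same setup as
        #' # above): a directory containing text files may also be passed
        #' fi <- da$get_file_info(ed)
        #' # The file information is printed
        #' print(fi)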
        #'
        #' # The test environment is removed. Comment the below line, so the
        #' # files generated by the function can be viewed
        #' em$td_env()
        get_file_info = function(res) {
            # The information message is shown
            private$dh("Generating file stats", "-", md = 1)
            # The list of files to check
            fl <- NULL
            # If a directory name was passed
            if (dir.exists(res)) {
                # All files in the directory are fetched
                fl <- dir(res, full.names = TRUE, pattern = "\\.txt$")
            }
            # If a file name was passed
            else if (file.exists(res)) {
                # The file name is set
                fl <- res
            }

            # Used to store overall information about files
            ostats <- data.frame(
                "total_lc" = 0,
                "max_ll" = 0,
                "min_ll" = 0,
                "mean_ll" = 0,
                "total_s" = 0
            )

            # Used to store information about each file
            fstats <- tstats <- data.frame()

            # Temporary variables for calculating max, min, mean line length
            temp_max <- temp_min <- temp_mean <- 0

            # For each file in the list
            for (fn in fl) {
                # The file is read
                lines <- private$read_file(fn, F)
                # The line count
                lc <- length(lines)
                # The file size
                size <- file.size(fn)
                # The file stats are updated
                ostats[["total_s"]] <- ostats[["total_s"]] + size
                ostats[["total_lc"]] <- ostats[["total_lc"]] + lc

                # The temporary variables are updated
                temp_max <- max(nchar(lines))
                temp_min <- min(nchar(lines))
                temp_mean <- round(mean(nchar(lines)))

                # The file stats are updated
                tstats <- data.frame(
                    "fn" = fn,
                    "total_lc" = lc,
                    "max_ll" = temp_max,
                    "min_ll" = temp_min,
                    "mean_ll" = temp_mean,
                    "size" = size
                )
                # The size is formatted
                tstats["size"] <-
                    utils:::format.object_size(tstats["size"], "auto")

                # The file stats are appended
                fstats <- rbind(fstats, tstats)

                if (temp_max > ostats["max_ll"]) {
                    ostats["max_ll"] <- temp_max
                }
                # The overall minimum line length is updated if it is unset or
                # greater than the current file's minimum line length
                if (ostats["min_ll"] == 0 || temp_min < ostats["min_ll"]) {
                    ostats["min_ll"] <- temp_min
                }
                if (temp_mean > ostats["mean_ll"]) {
                    ostats["mean_ll"] <- temp_mean
                }
            }
            # The total size is formatted
            ostats["total_s"] <-
                utils:::format.object_size(ostats["total_s"], "auto")

            # The required stats
            stats <- list("file_stats" = fstats, "overall_stats" = ostats)
            # The information message is shown
            private$dh("DONE", "=", md = 1)

            # The required stats are returned
            return(stats)
        },

        #' @description
        #' It extracts a given number of n-grams and their frequencies from an
        #' n-gram token file.
        #'
        #' The prefix parameter specifies the regular expression for matching
        #' n-grams. If this parameter is not specified, then the given number
        #' of n-grams is chosen at random.
        #' @param fn The n-gram file name.
        #' @param c The number of n-grams to return.
        #' @param pre The n-gram prefix, given as a regular expression.
        #' @return A data frame containing the matching n-grams and their
        #'   frequencies.
        #' @examples
        #' # Start of environment setup code
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # The name of the folder that will contain all the files. It will be
        #' # created in the current directory. NULL implies tempdir will be used
        #' fn <- NULL
        #' # The required files. They are default files that are part of the
        #' # package
        #' rf <- c("n2.RDS")
        #' # An object of class EnvManager is created
        #' em <- EnvManager$new(ve = ve, rp = "./")
        #' # The required files are downloaded
        #' ed <- em$setup_env(rf, fn)
        #' # End of environment setup code
        #'
        #' # The n-gram file name
        #' nfn <- paste0(ed, "/n2.RDS")
        #' # The DataAnalyzer object is created
        #' da <- DataAnalyzer$new(nfn, ve = ve)
        #' # Bi-grams starting with "and_" are returned
        #' df <- da$get_ngrams(fn = nfn, c = 10, pre = "^and_*")
        #' # The data frame is sorted by frequency
        #' df <- df[order(df$freq, decreasing = TRUE),]
        #' # The data frame is printed
        #' print(df)
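        #'
        #' # An illustrative extra call (assuming the same setup as above):
        #' # when no prefix is given, the n-grams are chosen at random
        #' rs <- da$get_ngrams(fn = nfn, c = 10)
        #' # The randomly chosen n-grams are printed
        #' print(rs)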
        #'
        #' # The test environment is removed. Comment the below line, so the
        #' # files generated by the function can be viewed
        #' em$td_env()
        get_ngrams = function(fn, c = NULL, pre = NULL) {
            # The data is read
            df <- private$read_obj(fn)
            # If the prefix is not given
            if (is.null(pre)) {
                # The sample indexes
                i <- sample(seq_len(nrow(df)), c)
                # The n-gram samples
                s <- df[i, ]
            }
            else {
                # The n-gram samples
                s <- df[grepl(pre, df$pre), ]
            }
            return(s)
        }
    ),
    private = list(
        # @field da_opts The options for the data analyzer object.
        # * **type**. The type of plot to display. The options are:
        #     'top_features', 'coverage'.
        # * **n**. For 'top_features', it is the number of top most occurring
        #     tokens.
        da_opts = list(
            "type" = "top_features",
            "n" = 10
        ),

        # @description
        # It generates a bar plot using ggplot2. The plot is a horizontal
        # bar plot filled with red. It has the given axis labels and main
        # title.
        # @param df The data to plot. It is a data frame with prefix and freq
        #   columns.
        # @param labels The main title, x and y axis labels.
        # @return The ggplot object is returned.
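        #
        # An illustrative call (a sketch, not executed): the data frame is
        # expected to have 'pre' and 'freq' columns, for example
        # df <- data.frame("pre" = c("of_the", "in_the"), "freq" = c(100, 80))
        # labels <- list(y = "Frequency", x = "Feature", title = "Top 2 Features")
        # g <- private$display_plot(df, labels)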
        display_plot = function(df, labels) {
            # The n-gram names and their frequencies are plotted
            g <- ggplot(data = df, aes(x = reorder(pre, freq), y = freq)) +
                geom_bar(stat = "identity", fill = "red") +
                ggtitle(labels[["title"]]) +
                coord_flip() +
                ylab(labels[["y"]]) +
                xlab(labels[["x"]])
            return(g)
        }
    )
)
