R/generate_document_term_vectors.R

#' A function to generate document term vectors from a variety of inputs.
#'
#' @param input A vector or list of strings, term vectors, or paths to raw text, csv, or saved ngram extraction files, which you wish to turn into document term vectors.
#' @param data_type The type of data provided to the function. Defaults to "string", in which case the user must provide a vector or list of strings, with each string representing one document. Alternatively, the user may specify one of the following input formats: "term vector", in which case a list of document term vectors is expected; "raw text", in which case a vector of paths to plain text files should be provided; "csv", in which case a vector of paths to document term csv files must be specified (the csv_separator, csv_word_column, and csv_header arguments, and optionally csv_count_column, must then also be specified); or "ngrams", in which case a file path to a block of saved ngram
#' extractions generated by the ngrams() function is expected. If "ngrams" is
#' selected, then ngram_type must also be specified.
#' @param ngram_type The type of ngram we wish to use to generate document term
#' vectors. Can be one of "jk_filtered", "verb_filtered", "phrases", or
#' "x_grams", where x is a number specifying the n-gram length. Can only be
#' used with input generated by the ngrams() function. Defaults to NULL.
#' @param data_directory Optional argument specifying where the data is stored.
#' @param tokenization_method Defaults to "None", in which case text input is kept as is. Alternatively, the user may select "RegEx", in which case all characters matching the regex argument will be removed from the input. See clean_document_text() for more examples.
#' @param regex Defaults to removing all characters that are not upper or lowercase letters or spaces. Can be set as desired by the user.
#' @param output_type A string value indicating whether the resulting list object should be returned ("return"), saved to a file in the current working directory with name output_name.Rdata ("save"), or both ("return and save"). Defaults to "return".
#' @param output_name The file name we wish to give the output document term vector list object generated by this function if specifying a value of "save" or "return and save" for output_type. You do not need to provide the .Rdata extension as this is appended automatically. Defaults to NULL.
#' @param output_directory An optional alternate directory where output will be
#' stored. Defaults to NULL, in which case all output is stored in data_directory
#' when output_type is not "return".
#' @param csv_separator Defaults to "," but can be set to "\\t" for tab-separated values.
#' @param csv_word_column If you are providing one csv file per document, then you must specify the index of the column that contains the words. Defaults to NULL.
#' @param csv_count_column For memory efficiency, you may want to store only the counts of unique words in csv files. If your data include counts, then you must specify the index of the column that contains the counts. Defaults to NULL.
#' @param csv_header Logical indicating whether the csv files provided have a header. Defaults to FALSE.
#' @param keep_sequence Logical indicating whether the full term sequence of each document should be maintained (TRUE) or whether documents should be condensed into unique terms and their counts (FALSE). Defaults to FALSE, as this can be a much more memory-efficient representation.
#' @return A document term vector list, or, if keep_sequence = FALSE, a list containing a document term vector list and a document term count list.
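#' @examples
#' \dontrun{
#' # A minimal usage sketch with illustrative in-memory strings:
#' docs <- c("The cat sat on the mat.", "The dog barked at the cat.")
#' doc_terms <- generate_document_term_vectors(
#'     input = docs,
#'     data_type = "string",
#'     tokenization_method = "RegEx")
#' }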
#' @export
generate_document_term_vectors <- function(
    input,
    data_type = c("string","term vector","raw text","csv","ngrams"),
    ngram_type = NULL,
    data_directory = NULL,
    tokenization_method = c("None","RegEx"),
    regex = "[^a-zA-Z\\s]",
    output_type = c("return", "save", "return and save"),
    output_name = NULL,
    output_directory = NULL,
    csv_separator = ",",
    csv_word_column = NULL,
    csv_count_column = NULL,
    csv_header = FALSE,
    keep_sequence = FALSE) {
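
    # High-level flow: resolve defaults, optionally switch the working
    # directory, dispatch on data_type to populate document_term_vector_list
    # (and document_term_count_list when keep_sequence = FALSE), then return
    # and/or save the results.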

    # deal with default values
    if (length(data_type) > 1) {
        data_type <- data_type[1]
    }
    if (length(tokenization_method) > 1) {
        tokenization_method <- tokenization_method[1]
    }
    if (length(output_type) > 1) {
        output_type <- output_type[1]
    }

    # save the current working directory, and restore it when the function
    # exits (including on early returns)
    current_directory <- getwd()
    on.exit(setwd(current_directory), add = TRUE)
    # now set the working directory if one was provided
    if (!is.null(data_directory)) {
        setwd(data_directory)
        if (is.null(output_directory)) {
            output_directory <- data_directory
        }
    }

    # check to make sure an output name is provided if we are saving
    if (output_type != "return") {
        if (is.null(output_name)) {
            stop("You must provide a valid output name if you wish to save output to disk. You do not need to provide the .Rdata extension as this is appended automatically.\n")
        }
    }

    # helper to extract the last n characters of a string, used below to
    # check whether the user provided an .Rdata/.rdata/.rda extension
    substrRight <- function(x, n){
        substr(x, nchar(x) - n + 1, nchar(x))
    }

    # remove the trailing file type if necessary
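    # (e.g. output_name = "my_docs.Rdata" becomes "my_docs")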
    if (!is.null(output_name)) {
        if (substrRight(output_name, 6) == ".Rdata" ||
            substrRight(output_name, 6) == ".rdata") {
            output_name <- substr(output_name, 1, nchar(output_name) - 6)
        }
        if (substrRight(output_name,4) == ".rda") {
            output_name <- substr(output_name, 1, nchar(output_name) - 4)
        }
    }

    # allocate list objects
    document_term_vector_list <- vector(mode = "list", length = length(input))
    document_term_count_list <- vector(mode = "list", length = length(input))

    #**********************#
    #****** CSV Data ******#
    #**********************#
    if (data_type == "csv") {
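
        # Expected per-document csv layout (hypothetical example, with
        # csv_header = TRUE, csv_word_column = 1, csv_count_column = 2):
        #
        #     word,count
        #     the,12
        #     cat,3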

        # if csv_count_column is not NULL, then warn the user that we are not
        # keeping sequence
        if (!is.null(csv_count_column) && keep_sequence) {
            keep_sequence <- FALSE
            warning("You provided a csv_count_column so keep_sequence was set to FALSE. If you intended to keep sequence, then set csv_count_column = NULL.\n")
        }

        if (!keep_sequence && is.null(csv_count_column)) {
            stop("You must provide a csv_count_column index if keep_sequence == FALSE.\n")
        }

        if (is.list(input)) {
            input <- unlist(input)
        }

        if (keep_sequence) {
            for (i in seq_along(input)) {
                data <- readr::read_delim(file = input[i],
                                          delim = csv_separator,
                                          col_names = csv_header)
                # deal with the new tibble format from readr.
                data <- as.data.frame(data, stringsAsFactors = FALSE)
                if (tokenization_method == "RegEx") {
                    clean <- clean_document_text(
                        text = as.character(data[, csv_word_column]),
                        regex = regex)
                    document_term_vector_list[[i]] <- clean
                } else {
                    document_term_vector_list[[i]] <-
                        as.character(data[, csv_word_column])
                }
            }

        } else {
            for (i in seq_along(input)) {
                cat("Reading in file", i, "of", length(input), "\n")
                data <- readr::read_delim(file = input[i],
                                          delim = csv_separator,
                                          col_names = csv_header)
                # deal with the new tibble format from readr.
                data <- as.data.frame(data, stringsAsFactors = FALSE)
                temp1 <- list(as.character(data[, csv_word_column]))
                temp2 <- list(counts = as.numeric(data[, csv_count_column]))

                vocab <- count_words(
                    document_term_vector_list = temp1,
                    maximum_vocabulary_size = -1,
                    document_term_count_list = temp2)
                document_term_vector_list[[i]] <- vocab$unique_words
                document_term_count_list[[i]] <- vocab$word_counts
            }

        }
    }# end of csv data type conditional

    #******************************************#
    #****** One String Per Document Data ******#
    #******************************************#
    if(data_type == "string"){

        if(class(input) == "list"){
            input <- unlist(input)
        }

        if(class(input) != "character"){
            stop("If data_type == 'string' then you must provide documents as either a character vector with one entry per document or a list with one entry per document." )
        }

        if (keep_sequence) {
            for (i in seq_along(input)) {
                cat("Reading in document", i, "of", length(input), "\n")
                if (tokenization_method == "RegEx") {
                    clean <- clean_document_text(text = as.character(input[i]),
                                                 regex = regex)
                    document_term_vector_list[[i]] <- clean
                } else {
                    stop("tokenization_method = 'RegEx' is currently the only method available for tokenizing documents provided as strings. Please use this option.")
                }
            }
        } else {
            for (i in seq_along(input)) {
                cat("Reading in document", i, "of", length(input), "\n")
                if (tokenization_method == "RegEx") {
                    clean <- clean_document_text(text = as.character(input[i]),
                                                 regex = regex)
                    vocab <- count_words(
                        document_term_vector_list = list(as.character(clean)),
                        maximum_vocabulary_size = -1,
                        document_term_count_list = NULL)
                    document_term_vector_list[[i]] <- vocab$unique_words
                    document_term_count_list[[i]] <- vocab$word_counts
                } else {
                    stop("tokenization_method = 'RegEx' is currently the only method available for tokenizing documents provided as strings. Please use this option.")
                }
            }
        }

    }# end of string data type

    #***************************************#
    #****** Document Term Vector Data ******#
    #***************************************#

    if(data_type == "term vector"){

        if(class(input) == "character"){
            input <- list(doc = input)
            cat("You provided a string vector so this function is assuming that you only provided one document and proceeding accordingly.")
        }

        if(class(input) != "list"){
            stop("If data_type == 'term vector' then you must provide documents as a list with one term vector per document." )
        }

        if (keep_sequence) {
            for (i in seq_along(input)) {
                cat("Reading in document", i, "of", length(input), "\n")
                document_term_vector_list[[i]] <- input[[i]]
            }
        } else {
            for (i in seq_along(input)) {
                cat("Reading in document", i, "of", length(input), "\n")
                if (tokenization_method == "RegEx") {
                    clean <- clean_document_text(text = as.character(input[[i]]),
                                                 regex = regex)
                    vocab <- count_words(
                        document_term_vector_list = list(as.character(clean)),
                        maximum_vocabulary_size = -1,
                        document_term_count_list = NULL)
                    document_term_vector_list[[i]] <- vocab$unique_words
                    document_term_count_list[[i]] <- vocab$word_counts
                } else {
                    vocab <- count_words(
                        document_term_vector_list = input[i],
                        maximum_vocabulary_size = -1,
                        document_term_count_list = NULL)
                    document_term_vector_list[[i]] <- vocab$unique_words
                    document_term_count_list[[i]] <- vocab$word_counts
                }
            }
        }

    }# end of term vector data type


    #***************************#
    #****** Raw Text Data ******#
    #***************************#
    if(data_type == "raw text"){

        if(class(input) == "list"){
            input <- unlist(input)
        }

        if (keep_sequence) {
            for (i in seq_along(input)) {
                cat("Reading in file", i, "of", length(input), "\n")
                data <- readr::read_lines(file = input[i])
                data <- paste0(data, collapse = " ")
                if (tokenization_method == "RegEx") {
                    clean <- clean_document_text(text = data, regex = regex)
                    document_term_vector_list[[i]] <- clean
                } else {
                    stop("If data_type = 'raw text' then you must specify tokenization_method = 'RegEx' and provide a valid regular expression for characters to remove (or simply use the default).")
                }
            }
        } else {
            for (i in seq_along(input)) {
                cat("Reading in file", i, "of", length(input), "\n")
                data <- readr::read_lines(file = input[i])
                data <- paste0(data, collapse = " ")
                if (tokenization_method == "RegEx") {
                    clean <- clean_document_text(text = data, regex = regex)
                    vocab <- count_words(
                        document_term_vector_list = list(clean),
                        maximum_vocabulary_size = -1,
                        document_term_count_list = NULL)
                    document_term_vector_list[[i]] <- vocab$unique_words
                    document_term_count_list[[i]] <- vocab$word_counts
                } else {
                    stop("If data_type = 'raw text' then you must specify tokenization_method = 'RegEx' and provide a valid regular expression for characters to remove (or simply use the default).")
                }
            }
        }
    }# end of raw text data type conditional


    #************************************#
    #****** NGram Extractions Data ******#
    #************************************#

    if (data_type == "ngrams") {
        if (is.null(ngram_type)) {
            stop("You must specify a valid ngram_type.")
        }
        # NGrams will be populated by load() below
        NGrams <- NULL
        # load the saved ngram extractions
        load(input)
        # make sure we keep sequence
        keep_sequence <- TRUE
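
        # Each element of NGrams is assumed (based on the indexing below) to
        # be a list with entries $jk_filtered, $verb_filtered, $phrases, and
        # $ngrams, a named list with one entry per n-gram length (e.g.
        # "2_grams").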
        # now get the real length of the doc term vector list
        document_term_vector_list <- vector(mode = "list",
                                            length = length(NGrams))
        for (i in seq_along(NGrams)) {
            if (ngram_type == "jk_filtered") {
                document_term_vector_list[[i]] <- NGrams[[i]]$jk_filtered
            } else if (ngram_type == "verb_filtered") {
                document_term_vector_list[[i]] <- NGrams[[i]]$verb_filtered
            } else if (ngram_type == "phrases") {
                document_term_vector_list[[i]] <- NGrams[[i]]$phrases
            } else {
                # e.g. ngram_type = "2_grams" selects the entry in
                # NGrams[[i]]$ngrams whose name contains "2"
                num <- as.numeric(stringr::str_split(ngram_type, "_")[[1]][1])
                index <- grep(num, names(NGrams[[i]]$ngrams))[1]
                document_term_vector_list[[i]] <- NGrams[[i]]$ngrams[[index]]
            }
        }
    }# end of ngrams conditional


    #*************************************#
    #****** Return Term Vector List ******#
    #*************************************#
    if(output_type == "return"){
        if(keep_sequence){
            return(document_term_vector_list)
        }else{
            return_list <- list(
                document_term_vector_list = document_term_vector_list,
                document_term_count_list = document_term_count_list
                                )
            return(return_list)
        }
    }else if (output_type == "save") {
        setwd(output_directory)
        if (keep_sequence) {
            save(list = c("document_term_vector_list","input"),
                 file = paste(output_name,".Rdata", sep = ""))
        } else {
            save(list = c("document_term_vector_list",
                          "document_term_count_list",
                          "input"),
                 file = paste(output_name,".Rdata", sep = ""))
        }
    } else {
        setwd(output_directory)
        if (keep_sequence) {
            return(document_term_vector_list)
            save(document_term_vector_list,
                 file = paste(output_name,".Rdata", sep = ""))
        } else {
            save(list = c("document_term_vector_list",
                          "document_term_count_list"),
                 file = paste(output_name,".Rdata", sep = ""))
            return_list <- list(
                document_term_vector_list = document_term_vector_list,
                document_term_count_list = document_term_count_list
            )
            return(return_list)
        }
    }
    setwd(current_directory)
}