R/generate_blocked_document_term_vectors.R

Defines functions generate_blocked_document_term_vectors

Documented in generate_blocked_document_term_vectors

#' A function to generate and save blocks of document term vectors to coherently named files from a variety of inputs.
#'
#' @param input A list of strings, term vectors, raw documents, or csv files you wish to turn into document term vectors.
#' @param output_stem The stem of the file name we wish to give each block of output document term vector list objects generated by this function.
#' @param data_directory Argument specifying where the data is stored.
#' @param output_directory Optional directory to store blocked document term
#' vector output.
#' @param block_size The number of documents to group together in a single block of text to save. Defaults to 100.
#' @param data_type The type of data provided to the function.
#' @param ngram_type The type of ngram we wish to use to generate document term
#' vectors. Can be one of ngrams "jk_filtered", "verb_filtered", "phrases", or
#' any of "x_grams" where x is a number specifying the n_gram length. Can only be
#' used with input generated by the ngrams() function.
#' @param tokenization_method Currently not available.
#' @param csv_separator Defaults to "," but can be set to "\\t" for tab-separated values.
#' @param csv_word_column If you are providing one csv file per document, then you must specify the index of the column that contains the words. Defaults to NULL.
#' @param csv_count_column For memory efficiency, you may want to store only the counts of unique words in csv files. If your data include counts, then you must specify the index of the column that contains the counts. Defaults to NULL.
#' @param csv_header Logical indicating whether the csv files provided have a header. Defaults to FALSE.
#' @param keep_sequence Logical indicating whether document term vectors should be condensed into unique terms and counts (FALSE) or whether the full term sequence should be maintained for storage (TRUE). Defaults to FALSE, as the condensed representation can be much more memory efficient.
#' @return Saves blocks of document term vectors to file; no value is returned.
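#' @examples
#' \dontrun{
#' # A minimal illustrative sketch, not taken from the original package
#' # documentation: the directory paths and output stem below are
#' # hypothetical placeholders.
#' generate_blocked_document_term_vectors(
#'     input = list.files("~/corpus_txt"),
#'     output_stem = "corpus_dtv_block",
#'     data_directory = "~/corpus_txt",
#'     output_directory = "~/corpus_dtv_blocks",
#'     block_size = 100,
#'     data_type = "raw text")
#' }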
#' @export
generate_blocked_document_term_vectors <- function(
    input,
    output_stem,
    data_directory,
    output_directory = NULL,
    block_size = 100,
    data_type = c("string","term vector","raw text","csv","ngrams"),
    ngram_type = NULL,
    tokenization_method = c("RegEx"),
    csv_separator = ",",
    csv_word_column = NULL,
    csv_count_column = NULL,
    csv_header = FALSE,
    keep_sequence = FALSE){

    # determine the number of blocks
    num_blocks <- ceiling(length(input)/block_size)
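    # e.g., 250 input documents with the default block_size of 100 gives
    # ceiling(250 / 100) = 3 blocks, with the final block holding the last 50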

    # if we are using ngrams, it will already be in blocks
    if (identical(data_type, "ngrams")) {
        num_blocks <- length(input)
        block_size <- 1
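        # each element of the ngrams() output is treated as its own block,
        # so one output file is written per existing block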
    }

    # generate a list object containing the relevant block information
    max_index <- length(input)
    start <- 1
    # clamp the first end index so a single partial block cannot index past the input
    end <- min(block_size, max_index)
    block_information <- vector(mode = "list",length = num_blocks)
    for (i in 1:num_blocks) {
        block_information[[i]]$input <- input[start:end]
        block_information[[i]]$output_name <- paste(output_stem,"_",i,sep = "")
        start <- end + 1
        end <- end + block_size
        if (end > max_index) {
            end <- max_index
        }
    }
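
    # at this point each block_information[[i]] holds the input elements for
    # block i and an output file name of the form "<output_stem>_i"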

    # now we generate the blocks
    for (i in 1:num_blocks) {
        cat("Currently generating block",i,"of",num_blocks,"\n")
        generate_document_term_vectors(
            input = block_information[[i]]$input,
            data_type = data_type,
            ngram_type = ngram_type,
            data_directory = data_directory,
            tokenization_method = tokenization_method,
            output_type = "save",
            output_name = block_information[[i]]$output_name,
            output_directory = output_directory,
            csv_separator = csv_separator,
            csv_word_column = csv_word_column,
            csv_count_column = csv_count_column,
            csv_header = csv_header,
            keep_sequence = keep_sequence)
    }
}