SpeedReader: High Performance Text Analysis

library(SpeedReader)
context("Generate Document Term Vectors")

# Exercises generate_document_term_vectors() with each input type used in
# this file: tab-separated term/count files, in-memory strings, previously
# generated term vectors, and raw text files.
#
# The calls were originally wrapped in system.time(), but the timings were
# discarded (nothing is auto-printed inside test_that()), so the wrappers
# have been removed.
test_that("Document term vectors are generated for each input type", {
    # --- Input type 1: tab-separated files of (term, count) rows ----------
    files <- get_file_paths(source = "bill tsvs")
    cat("\n")
    documents <- generate_document_term_vectors(
        input = files,
        data_type = "csv",
        csv_separator = "\t",
        csv_word_column = 1,
        csv_count_column = 2,
        csv_header = TRUE,
        keep_sequence = FALSE)

    # save this stuff as example data
    # document_term_vector_list <- documents[[1]]
    # document_term_count_list <- documents[[2]]
    # devtools::use_data(document_term_vector_list)
    # devtools::use_data(document_term_count_list)

    # The second element holds the per-document term counts (see the
    # commented-out example-data code above). The total number of counted
    # terms across all documents must match the known fixture total.
    # testthat's argument order is (object, expected).
    total_term_count <- sum(unlist(documents[[2]]))
    expect_equal(total_term_count, 69825)

    # --- Input type 2: a vector of strings --------------------------------
    # The digits and punctuation embedded in the text are deliberate noise.
    docs <- rep("One of the most common things we might want to do is read in and clean a raw input text file. To do this, we will want to make use of two functions, the first of these will clean and individual string, removing any characters that are not letters, lowercasing everything, and getting rid of additional spaces between words before tokenizing the resulting text and retur12ning a 12345667 vector of indiv!!idual words:",10)
    documents <- generate_document_term_vectors(
        input = docs,
        data_type = "string",
        tokenization_method = "RegEx",
        keep_sequence = FALSE)

    # Named `cleaned_tokens` rather than `str` so base::str() is not masked.
    cleaned_tokens <- clean_document_text(text = docs[1])

    # With keep_sequence = FALSE there should be one count entry per unique
    # cleaned token in the first document.
    expect_equal(
        length(documents$document_term_count_list[[1]]),
        length(unique(cleaned_tokens)))

    # --- Input type 3: previously generated term vectors ------------------
    documents2 <- generate_document_term_vectors(
        input = documents$document_term_vector_list,
        data_type = "term vector",
        keep_sequence = FALSE)

    documents3 <- generate_document_term_vectors(
        input = documents$document_term_vector_list,
        data_type = "term vector",
        keep_sequence = TRUE)

    # Feeding term vectors back in should reproduce the same term vectors.
    expect_equal(
        documents2$document_term_vector_list,
        documents$document_term_vector_list)
    # NOTE(review): this compares the whole return value for
    # keep_sequence = TRUE against the term vector list, which presumes that
    # call returns the bare list rather than a named wrapper -- confirm
    # against generate_document_term_vectors().
    expect_equal(documents3, documents$document_term_vector_list)

    # --- Input type 4: raw text files -------------------------------------
    files <- get_file_paths(source = "raw text")
    cat("\n")
    # Smoke check only: the call must complete without error. The result was
    # never inspected in the original test, so it is not stored.
    # NOTE(review): consider adding an expectation on this output.
    generate_document_term_vectors(
        input = files,
        data_type = "raw text",
        tokenization_method = "RegEx",
        keep_sequence = FALSE)

})

matthewjdenny/SpeedReader documentation built on March 25, 2020, 5:32 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

matthewjdenny/SpeedReader
High Performance Text Analysis

tests/testthat/test_generate_document_term_vectors.R
In matthewjdenny/SpeedReader: High Performance Text Analysis

R Package Documentation

Browse R Packages

We want your feedback!

matthewjdenny/SpeedReader High Performance Text Analysis

tests/testthat/test_generate_document_term_vectors.R In matthewjdenny/SpeedReader: High Performance Text Analysis

R Package Documentation

Browse R Packages

We want your feedback!

matthewjdenny/SpeedReader
High Performance Text Analysis

tests/testthat/test_generate_document_term_vectors.R
In matthewjdenny/SpeedReader: High Performance Text Analysis