Nothing
test_that("dtm_stats returns expect output", {
# simplified, base R dtm
out <- dtm_stats(dtm.bse,
richness = TRUE,
distribution = TRUE,
central = TRUE,
character = TRUE,
simplify = TRUE
)
expect_equal(dim(out), as.integer(c(1L, 26L)))
expect_type(out, "list")
expect_equal(unlist(out$n_docs), nrow(dtm.bse))
expect_equal(unlist(out$n_types), ncol(dtm.bse))
expect_equal(unlist(out$n_tokens), sum(dtm.bse))
# not simplified, base R dtm
out <- dtm_stats(dtm.bse)
expect_equal(length(out), 5L)
expect_equal(dim(out[[1]]), as.integer(c(5L, 2L)))
expect_equal(dim(out[[2]]), as.integer(c(4L, 2L)))
expect_equal(dim(out[[3]]), as.integer(c(10L, 2L)))
expect_equal(dim(out[[4]]), as.integer(c(4L, 2L)))
expect_equal(dim(out[[5]]), as.integer(c(3L, 2L)))
expect_type(out, "list")
expect_equal(as.character(out[[1]][1, 2]), as.character(nrow(dtm.bse)))
expect_equal(as.character(out[[1]][3, 2]), as.character(ncol(dtm.bse)))
expect_equal(as.character(out[[1]][4, 2]), as.character(sum(dtm.bse)))
# simplified, dgCMatrix R dtm
out <- dtm_stats(dtm.dgc,
richness = TRUE,
distribution = TRUE,
central = TRUE,
character = TRUE,
simplify = TRUE
)
expect_equal(dim(out), as.integer(c(1L, 26L)))
expect_type(out, "list")
expect_equal(unlist(out$n_docs), nrow(dtm.bse))
expect_equal(unlist(out$n_types), ncol(dtm.bse))
expect_equal(unlist(out$n_tokens), sum(dtm.bse))
# not simplified, dgCMatrix R dtm
out <- dtm_stats(dtm.dgc)
expect_equal(dim(out[[1]]), as.integer(c(5L, 2L)))
expect_equal(dim(out[[2]]), as.integer(c(4L, 2L)))
expect_equal(dim(out[[3]]), as.integer(c(10L, 2L)))
expect_equal(dim(out[[4]]), as.integer(c(4L, 2L)))
expect_equal(dim(out[[5]]), as.integer(c(3L, 2L)))
expect_type(out, "list")
expect_equal(as.character(out[[1]][1, 2]), as.character(nrow(dtm.bse)))
expect_equal(as.character(out[[1]][3, 2]), as.character(ncol(dtm.bse)))
expect_equal(as.character(out[[1]][4, 2]), as.character(sum(dtm.bse)))
})
test_that("dtm_stats returns basic output alone", {
out <- dtm_stats(
dtm = dtm.bse,
richness = FALSE,
distribution = FALSE,
central = FALSE,
character = FALSE,
simplify = FALSE
)
expect_type(out, "list")
expect_equal(length(out), as.integer(1L))
expect_equal(dim(out[[1]]), as.integer(c(5L, 2L)))
})
test_that("dtm_melter works on both base and sparse", {
out_a <- dtm_melter(dtm = dtm.bse)
out_b <- dtm_melter(dtm = dtm.dgc)
expect_type(out_a, "list")
expect_type(out_b, "list")
expect_equal(ncol(out_a), as.integer(3L))
expect_equal(ncol(out_b), as.integer(3L))
expect_equal(out_a, out_b)
expect_equal(sum(out_a$freq), sum(dtm.bse))
expect_equal(sum(out_b$freq), sum(dtm.dgc))
})
test_that("dtm_builder produces identical dtm to cast_dtm", {
# example 1
dtm.a <- my_corpus %>%
dtm_builder(clean_text, line_id)
# example 2
dtm.b <- dtm_builder(my_corpus, text = clean_text, doc_id = line_id)
# example 3
dtm.c <- my_corpus %>%
dplyr::mutate(
clean_text = gsub("'", "", text),
clean_text = tolower(clean_text)
) %>%
dtm_builder(clean_text, line_id)
# compare to tidy
dtm.tidy <- my_corpus %>%
tidytext::unnest_tokens(word, clean_text) %>%
dplyr::count(line_id, word, sort = TRUE) %>%
tidytext::cast_dtm(line_id, word, n)
expect_identical(dim(dtm.a), dim(dtm.b))
expect_identical(dim(dtm.a), dim(dtm.c))
expect_identical(dim(dtm.b), dim(dtm.c))
expect_identical(dim(dtm.a), dim(dtm.tidy))
expect_identical(sum(dtm.a), sum(dtm.b))
expect_identical(sum(dtm.a), sum(dtm.c))
expect_identical(sum(dtm.b), sum(dtm.c))
expect_identical(sum(dtm.a), sum(dtm.tidy))
expect_identical(
as.vector(colnames(dtm.a)),
as.vector(colnames(dtm.b))
)
expect_identical(
as.vector(colnames(dtm.a)),
as.vector(colnames(dtm.c))
)
expect_identical(
as.vector(colnames(dtm.b)),
as.vector(colnames(dtm.c))
)
expect_identical(
as.vector(sort(colnames(dtm.a))),
as.vector(sort(colnames(dtm.tidy)))
)
})
test_that("dtm_builder error/message if last row is blank", {
my_corpus <- data.frame(
my_text = c(
"I hear babies crying I watch them grow",
"They'll learn much more than I'll ever know",
"And I think to myself",
"What a wonderful world",
"Yes I think to myself",
"What a wonderful world"
),
line_id = paste0("line", seq_len(6))
)
my_corpus$my_text[6] <- ""
expect_error(
expect_message(dtm_builder(my_corpus, my_text, line_id))
)
})
test_that("dtm_builder works with vocab", {
vocab <- vocab_builder(corpus, text)
new.vocab <- vocab[!vocab %in% c("moon")]
expect_identical(
dim(dtm_builder(corpus, text, doc_id,
vocab = new.vocab
)),
as.integer(c(10, 43))
)
expect_identical(
dim(dtm_builder(corpus, text, doc_id,
vocab = new.vocab, chunk = 4L
)),
as.integer(c(19, 43))
)
expect_identical(
dim(dtm_builder(corpus, text, doc_id,
vocab = new.vocab
)),
as.integer(c(10, 43))
)
expect_error(
expect_message(
dtm_builder(corpus, text, doc_ids,
vocab = c(
"hear", "babies",
"world", "picklespit"
)
)
)
)
expect_error(
expect_message(
dtm_builder(corpus, text, doc_ids,
vocab = new.vocab, chunk = 5L
)
)
)
expect_error(
expect_message(
dtm_builder(corpus, text, doc_ids,
vocab = new.vocab
)
)
)
})
test_that("dtm_builder error/message if doc_id is wrong...", {
my_corpus <- data.frame(
my_text = c(
"I hear babies crying I watch them grow",
"They'll learn much more than I'll ever know",
"And I think to myself",
"What a wonderful world",
"Yes I think to myself",
"What a wonderful world"
),
line_id = paste0("line", seq_len(6))
)
expect_error(
expect_message(
dtm_builder(my_corpus, my_text, doc_id = line_ids)
)
)
})
test_that("dtm_builder chunks correctly", {
chunk <- 3L
dtm.e <- corpus %>%
dtm_builder(text, doc_id, chunk = chunk)
expect_equal(sum(dtm.e[1, ]), chunk)
dtm.f <- corpus %>%
dtm_builder(text, doc_id)
expect_equal(sum(dtm.e), sum(dtm.f))
chunk <- 100L
dtm.g <- corpus %>%
dtm_builder(text, doc_id, chunk = chunk)
expect_equal(sum(dtm.g[1, ]), sum(dtm.f))
})
test_that("dtm resampler creates DTM of the same dimensions", {
out <- dtm_resampler(dtm.dgc, alpha = 0.2)
expect_s4_class(out, "dgCMatrix")
expect_identical(dim(out), dim(dtm.dgc))
expect_equal(sum(out), sum(dtm.dgc) * 0.2, tolerance = 0.1)
out <- dtm_resampler(dtm.dgc, alpha = 0.5)
expect_s4_class(out, "dgCMatrix")
expect_identical(dim(out), dim(dtm.dgc))
expect_equal(sum(out), sum(dtm.dgc) * 0.5, tolerance = 0.1)
out <- dtm_resampler(dtm.dgc, alpha = 0.7)
expect_s4_class(out, "dgCMatrix")
expect_identical(dim(out), dim(dtm.dgc))
expect_equal(sum(out), sum(dtm.dgc) * 0.7, tolerance = 0.1)
out <- dtm_resampler(dtm.dgc, alpha = 1)
expect_s4_class(out, "dgCMatrix")
expect_identical(dim(out), dim(dtm.dgc))
expect_equal(sum(out), sum(dtm.dgc) * 1, tolerance = 0.1)
out <- dtm_resampler(dtm.dgc, n = 20)
expect_s4_class(out, "dgCMatrix")
expect_true(all(rowSums(out) == 20))
out <- dtm_resampler(dtm.dgc)
expect_s4_class(out, "dgCMatrix")
expect_true(all(rowSums(out) == rowSums(dtm.dgc)))
expect_equal(sum(out), sum(dtm.dgc))
expect_warning(dtm_resampler(dtm.dgc, alpha = 0.5, n = 20))
})
# tests for internal functions
test_that(".convert_mat_to_dgCMatrix convert all dtms to dgCMatrix", {
## base R matrix ##
expect_s4_class(.convert_mat_to_dgCMatrix(dtm.bse), "dgCMatrix")
## dgCMatrix matrix ##
expect_s4_class(.convert_mat_to_dgCMatrix(dtm.dgc), "dgCMatrix")
## dfm//dgCMatrix matrix ##
expect_s4_class(.convert_mat_to_dgCMatrix(dtm.dfm), "dgCMatrix")
## tm//simple_triplet_matrix matrix ##
expect_s4_class(.convert_mat_to_dgCMatrix(dtm.tm), "dgCMatrix")
## TermDocumentMatrix // tm//simple_triplet_matrix matrix ##
expect_s4_class(.convert_mat_to_dgCMatrix(dtm.tdm), "dgCMatrix")
})
test_that("dtm resampler works on output of .prep_cmd_INPUT", {
## dgCMatrix matrix ##
out <- .prep_cmd_INPUT(
dtm = dtm.dgc[, seq_len(35)],
cw = cw.oov,
cv = NULL,
wv = fake_word_vectors_oov,
missing = "stop"
)
dtm.samp <- dtm_resampler(out$DTM, alpha = 1)
expect_s4_class(dtm.samp, "dgCMatrix")
expect_equal(nrow(dtm.samp), nrow(out$DTM))
expect_equal(ncol(dtm.samp), ncol(out$DTM))
})
test_that("compare dtm dense and sparse", {
dtm_sp <- corpus %>%
dtm_builder(text, doc_id, dense = FALSE)
dtm_de <- corpus %>%
dtm_builder(text, doc_id, dense = TRUE)
expect_s4_class(dtm_sp, "dgCMatrix")
expect_identical(
class(dtm_de),
c("matrix", "array")
)
expect_equal(nrow(dtm_sp), nrow(dtm_de))
expect_equal(ncol(dtm_sp), ncol(dtm_de))
# compare with vocab
vocab <- vocab_builder(corpus, text)
new.vocab <- vocab[!vocab %in% c("moon")]
dtm_sp <- dtm_builder(corpus, text, doc_id,
vocab = new.vocab,
dense = FALSE
)
dtm_de <- dtm_builder(corpus, text, doc_id,
vocab = new.vocab,
dense = TRUE
)
expect_s4_class(dtm_sp, "dgCMatrix")
expect_identical(
class(dtm_de),
c("matrix", "array")
)
expect_equal(nrow(dtm_sp), nrow(dtm_de))
expect_equal(ncol(dtm_sp), ncol(dtm_de))
})
test_that("works w/o doc_id", {
dtm1 <- corpus %>%
dtm_builder(text, doc_id, dense = FALSE)
dtm2 <- corpus %>%
dtm_builder(text, dense = FALSE)
expect_equal(nrow(dtm1), nrow(dtm2))
expect_equal(ncol(dtm1), ncol(dtm2))
})
test_that("dtm functions with one document", {
dtm1 <- dtm_builder(corpus[1, ], text, doc_id, dense = FALSE)
expect_identical(dim(dtm1), as.integer(c(1, 6)))
dtm2 <- dtm_stopper(dtm1, stop_list = c("we", "moon"))
expect_identical(dim(dtm2), as.integer(c(1, 4)))
expect_identical(
dim(dtm_melter(dtm1)),
as.integer(c(6, 3))
)
dtm_c <- dtm_builder(corpus[1, ], text, chunk = 1L, dense = FALSE)
expect_identical(dim(dtm_c), as.integer(c(7, 6)))
expect_identical(
dim(dtm_melter(dtm_c)),
as.integer(c(7, 3))
)
})
test_that("seq_builder", {
out1 <- seq_builder(my_corpus, text, line_id)
out2 <- seq_builder(my_corpus,
text = text,
doc_id = line_id,
vocab = vocab_builder(my_corpus, text)
)
out3 <- seq_builder(my_corpus, text, line_id, maxlen = 5L)
out4 <- seq_builder(my_corpus, text, line_id, matrix = FALSE)
out5 <- seq_builder(my_corpus,
text = text,
doc_id = line_id,
maxlen = 5L,
vocab = vocab_builder(my_corpus, text)
)
out6 <- seq_builder(my_corpus, text)
expect_identical(dim(out1), c(6L, 8L))
expect_identical(dim(out3), c(6L, 5L))
expect_identical(length(out4), 6L)
expect_identical(length(out4$line1), 8L)
expect_identical(dim(out5), c(6L, 5L))
expect_identical(dim(out6), c(6L, 8L))
expect_equal(
length(attr(out1, "dic")), max(out1)
)
expect_equal(
length(attr(out2, "dic")), max(out2)
)
expect_equal(
length(attr(out3, "dic")), max(out3)
)
expect_equal(
length(attr(out4, "dic")), max(unlist(out4))
)
expect_identical(out1, out2)
expect_identical(
rownames(out6),
c(
"doc_1", "doc_2", "doc_3",
"doc_4", "doc_5", "doc_6"
)
)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.