Nothing
test_that("test dfm_lookup, issue #389", {
toks <- tokens(data_corpus_inaugural[1:5])
dict <- dictionary(list(Country = "united states",
HOR = c("House of Re*"),
law = c("law*", "constitution"),
freedom = c("free*", "libert*")))
expect_equal(featnames(dfm(tokens_lookup(toks, dictionary = dict), tolower = FALSE)),
c("Country", "HOR", "law", "freedom"))
# expect_error(dfm_lookup(dfm(toks), dictionary = dict),
# "dfm_lookup not implemented for ngrams > 1 and multi-word dictionary values")
dict2 <- dictionary(list(Country = "united",
HOR = c("House"),
law = c("law*", "constitution"),
freedom = c("free*", "libert*")))
expect_equal(
as.numeric(dfm_lookup(dfm(toks), dictionary = dict2)[, "Country"]),
c(4, 1, 3, 0, 1)
)
})
test_that("#459 apply a hierarchical dictionary to a dfm", {
txt <- c(d1 = "The United States is bordered by the Atlantic Ocean and the Pacific Ocean.",
d2 = "The Supreme Court of the United States is seldom in a united state.")
testdfm <- dfm(tokens(txt))
dict <- dictionary(list("geo" = list(
Countries = c("States"),
oceans = c("Atlantic", "Pacific")),
"other" = list(
gameconsoles = c("Xbox", "Nintendo"),
swords = c("States"))))
expect_equal(
as.matrix(dfm_lookup(testdfm, dict, valuetype = "fixed", levels = 1)),
matrix(c(3, 1, 1, 1), ncol = 2, dimnames = list(docs = c("d1", "d2"),
features = c("geo", "other")))
)
expect_equal(
as.matrix(dfm_lookup(testdfm, dict, valuetype = "fixed", levels = 1:2)),
matrix(c(1, 1, 2, 0, 0, 0, 1, 1), ncol = 4,
dimnames = list(docs = c("d1", "d2"),
features = c("geo.Countries", "geo.oceans", "other.gameconsoles", "other.swords")))
)
expect_equal(
as.matrix(dfm_lookup(testdfm, dict, valuetype = "fixed", levels = 2)),
matrix(c(1, 1, 2, 0, 0, 0, 1, 1), ncol = 4,
dimnames = list(docs = c("d1", "d2"),
features = c("Countries", "oceans", "gameconsoles", "swords")))
)
})
test_that("#459 extract the lower levels of a dictionary using a dfm", {
txt <- c(d1 = "The United States has the Atlantic Ocean and the Pacific Ocean.",
d2 = "Britain and Ireland have the Irish Sea and the English Channel.")
dict <- dictionary(list("US" = list(
Countries = c("States"),
oceans = c("Atlantic", "Pacific")),
"Europe" = list(
Countries = c("Britain", "Ireland"),
oceans = list(west = "Sea", east = "Channel"))))
testdfm <- dfm(tokens(txt))
expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = 1)),
matrix(c(3, 0, 0, 4), nrow = 2,
dimnames = list(docs = c("d1", "d2"), features = c("US", "Europe"))))
expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = 2)),
matrix(c(1, 2, 2, 2), nrow = 2,
dimnames = list(docs = c("d1", "d2"), features = c("Countries", "oceans"))))
expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = 1:2)),
matrix(c(1, 0, 2, 0, 0, 2, 0, 2), nrow = 2,
dimnames = list(docs = c("d1", "d2"),
features = c("US.Countries", "US.oceans",
"Europe.Countries", "Europe.oceans"))))
expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = 3)),
matrix(c(0, 1, 0, 1), nrow = 2,
dimnames = list(docs = c("d1", "d2"), features = c("west", "east"))))
expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = c(1, 3))),
matrix(c(3, 0, 0, 2, 0, 1, 0, 1), nrow = 2,
dimnames = list(docs = c("d1", "d2"),
features = c("US", "Europe", "Europe.west", "Europe.east"))))
expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = c(2, 3))),
matrix(c(1, 2, 2, 0, 0, 1, 0, 1), nrow = 2,
dimnames = list(docs = c("d1", "d2"),
features = c("Countries", "oceans", "oceans.west", "oceans.east"))))
expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = c(1, 4))),
matrix(c(3, 0, 0, 4), nrow = 2,
dimnames = list(docs = c("d1", "d2"), features = c("US", "Europe"))))
expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = 4)),
matrix(numeric(), nrow = 2, ncol = 0,
dimnames = list(docs = c("d1", "d2"), features = NULL)))
})
test_that("dfm_lookup raises error when dictionary has multi-word entries", {
toks <- tokens(data_corpus_inaugural[1:5])
dict <- dictionary(list(Country = "united states"), separator = " ")
expect_equal(
featnames(dfm_lookup(dfm(tokens_ngrams(toks, n = 2, concatenator = " ")), dictionary = dict)),
c("Country")
)
})
test_that("dfm_lookup works with multi-word keys, issue #704", {
dict <- dictionary(list("en" = list("foreign policy" = "foreign", "domestic politics" = "domestic")))
testdfm <- dfm(tokens(data_corpus_inaugural[1:5]))
expect_equal(featnames(dfm_lookup(testdfm, dict)),
c("en.foreign policy", "en.domestic politics"))
})
test_that("dfm_lookup return dfm even if no matches, issue #704", {
dict <- dictionary(list("en" = list("foreign policy" = "aaaaa", "domestic politics" = "bbbbb")))
dfmt <- dfm(tokens(data_corpus_inaugural[1:5]))
expect_identical(
featnames(dfm_lookup(dfmt, dict)),
c("en.foreign policy", "en.domestic politics")
)
expect_identical(
colSums(dfm_lookup(dfmt, dict)),
c("en.foreign policy" = 0, "en.domestic politics" = 0)
)
})
test_that("dfm_lookup return all features even if no matches when exclusive = FALSE, issue #116", {
dict <- dictionary(list("en" = list("foreign policy" = "aaaaa",
"domestic politics" = "bbbbb")))
testdfm <- dfm(tokens(data_corpus_inaugural[1:5]))
expect_equivalent(testdfm, dfm_lookup(testdfm, dict, exclusive = FALSE))
})
test_that("dfm_lookup verbose output works correctly", {
expect_message(
dfm_lookup(dfm(tokens(c(d1 = "a b c d", d2 = "c d e f g"))),
dictionary(list(one = "a", two = c("d", "e"))), verbose = TRUE),
"applying a dictionary consisting of 2 keys"
)
expect_silent(
dfm_lookup(dfm(tokens(c(d1 = "a b c d", d2 = "c d e f g"))),
dictionary(list(one = "a", two = c("d", "e"))), verbose = FALSE)
)
})
test_that("dfm_lookup with nomatch works", {
txt <- c(d1 = "a c d d", d2 = "a a b c c c e f")
dfm1 <- dfm(tokens(txt))
dict <- dictionary(list(one = c("a", "b"), two = c("e", "f")))
expect_equal(
as.matrix(dfm_lookup(dfm1, dict)),
as.matrix(dfm_lookup(dfm1, dict, nomatch = "_unmatched"))[, 1:2]
)
expect_equal(
as.matrix(dfm_lookup(dfm1, dict, nomatch = "_unmatched")),
matrix(c(1, 3, 0, 2, 3, 3), nrow = 2, dimnames = list(docs = c("d1", "d2"),
features = c("one", "two", "_unmatched")))
)
expect_warning(
dfm_lookup(dfm1, dict, nomatch = "ANYTHING", exclusive = FALSE),
"nomatch only applies if exclusive = TRUE"
)
})
test_that("dfm_lookup works with exclusive = TRUE, #958", {
txt <- c("word word2 document documents documenting",
"use using word word2")
dict <- dictionary(list(
document = "document*",
use = c("use", "using")
))
mx <- dfm(tokens(txt))
expect_equal(
as.matrix(dfm_lookup(mx, dict, exclusive = TRUE)),
matrix(c(3, 0, 0, 2), ncol = 2, dimnames = list(docs = c("text1", "text2"),
features = c("document", "use")))
)
expect_equal(
as.matrix(dfm_lookup(mx, dict, exclusive = FALSE, capkeys = TRUE)),
matrix(c(1, 1, 1, 1, 3, 0, 0, 2), ncol = 4,
dimnames = list(docs = c("text1", "text2"), features = c("word", "word2", "DOCUMENT", "USE")))
)
expect_equal(
as.matrix(dfm_lookup(mx, dict, exclusive = FALSE, capkeys = FALSE)),
matrix(c(1, 1, 1, 1, 3, 0, 0, 2), ncol = 4,
dimnames = list(docs = c("text1", "text2"),
features = c("word", "word2", "document", "use")))
)
})
test_that("dfm_lookup works with zero count features, #958", {
dict <- dictionary(list(A = "aa", B = "bb", D = "dd"))
mx1 <- as.dfm(matrix(c(0, 0, 0, 0, 1, 2), nrow = 2,
dimnames = list(c("doc1", "doc2"), c("aa", "bb", "cc"))))
mx2 <- as.dfm(matrix(c(4, 5, 0, 0, 0, 0), nrow = 2,
dimnames = list(c("doc1", "doc2"), c("aa", "bb", "cc"))))
expect_equal(as.matrix(dfm_lookup(mx1, dict, exclusive = TRUE)),
matrix(c(0, 0, 0, 0, 0, 0), nrow = 2,
dimnames = list(docs = c("doc1", "doc2"), features = c("A", "B", "D"))))
expect_equal(as.matrix(dfm_lookup(mx2, dict, exclusive = TRUE)),
matrix(c(4, 5, 0, 0, 0, 0), nrow = 2,
dimnames = list(docs = c("doc1", "doc2"), features = c("A", "B", "D"))))
expect_equal(as.matrix(dfm_lookup(mx1, dict, exclusive = FALSE)),
matrix(c(0, 0, 0, 0, 1, 2), nrow = 2,
dimnames = list(docs = c("doc1", "doc2"), features = c("A", "B", "cc"))))
expect_equal(as.matrix(dfm_lookup(mx2, dict, exclusive = FALSE)),
matrix(c(4, 5, 0, 0, 0, 0), nrow = 2,
dimnames = list(docs = c("doc1", "doc2"), features = c("A", "B", "cc"))))
})
test_that("dfm_lookup works on a weighted dfm", {
dict <- dictionary(list(first = LETTERS[1:5], second = LETTERS[6:10]))
d <- dfm_weight(data_dfm_lbgexample, "prop")
expect_equal(
colSums(dfm_lookup(d, dictionary = dict)),
c(first = 0.082, second = .740),
tol = .001
)
})
test_that("dfm_lookup with nomatch works with key that do not appear in the text, #1347", {
txt <- c("12032 Musgrave rd red hill",
"13 rad street windermore park queensland",
"130 right road",
"130 rtn road")
toks <- tokens(txt)
dict <- dictionary(list(CR = c("rd", "red"),
CB = c("street", "feet"),
CA = c("parl", "dark"))) # CA does not appear at all
dfm_dict <- dfm_lookup(dfm(toks), dict, nomatch = "NONE")
expect_identical(as.matrix(dfm_dict),
matrix(c(2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 5, 3, 3),
nrow = 4, dimnames = list(docs = c("text1", "text2", "text3", "text4"),
features = c("CR", "CB", "CA", "NONE"))))
})
test_that("dfm_lookup works exclusive = TRUE and FALSE, #970", {
dfmat <- dfm(tokens("say good bye to Hollywood"), tolower = FALSE)
dict <- dictionary(list(pos = "good", farewell = "bye"))
dfmat_ex <- dfm_lookup(dfmat, dict, exclusive = TRUE)
expect_true(dfmat_ex@meta$object$what == "dictionary")
expect_equal(featnames(dfmat_ex), c("pos", "farewell"))
dfmat_ne <- dfm_lookup(dfmat, dict, exclusive = FALSE)
expect_true(dfmat_ne@meta$object$what == "word")
expect_equal(featnames(dfmat_ne), c("say", "POS", "FAREWELL", "to", "Hollywood"))
})
test_that("dfm_lookup handle nested patterns correctly, #2159", {
dict <- dictionary(list(irish = c("ire*", "ireland", "irish"),
anger = c("ire*", "mad")))
toks <- tokens("I am mad about Ireland")
dfmat <- dfm(toks)
expect_equal(
as.matrix(dfm_lookup(dfmat, dict, exclusive = TRUE)),
matrix(c(1, 2), nrow = 1, dimnames = list(docs = "text1", features = c("irish", "anger")))
)
expect_equal(
as.matrix(dfm_lookup(dfmat, dict, exclusive = FALSE)),
matrix(c(1, 1, 1, 2, 1), nrow = 1,
dimnames = list(docs = "text1",
features = c("i", "am", "IRISH", "ANGER", "about")))
)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.