quanteda: Quantitative Analysis of Textual Data

txt <- c(d1 = "The United States is bordered by the Atlantic Ocean and the Pacific Ocean.",
         d2 = "The Supreme Court of the United States is seldom in a united state.",
         d3 = "It's Arsenal versus Manchester United, states the announcer.",
         d4 = "We need Manchester Unity in the Federal Republic of Germany today.",
         d5 = "United statehood is a good state.",
         d6 = "luv the united states XXOO!")
toks <- tokens(txt, remove_punct = TRUE)

test_that("multi-word dictionary keys are counted correctly", {
    
    dict_mw_fixed <- dictionary(list(Countries = c("United States", "Federal Republic of Germany"),
                                     oceans = c("Atlantic Ocean", "Pacific Ocean"),
                                     Institutions = c("federal government", "Supreme Court"),
                                     team = c("Manchester United", "Arsenal")), tolower = FALSE)
    tokens_case_asis <- 
        tokens_lookup(toks, dict_mw_fixed, valuetype = "fixed", case_insensitive = FALSE)
    
    dfm_case_asis <- dfm(tokens_case_asis, tolower = FALSE)
    expect_equal(as.vector(dfm_case_asis[, "Countries"]), c(1, 1, 0, 1, 0, 0))
    expect_equal(as.vector(dfm_case_asis[, "team"]), c(0, 0, 2, 0, 0, 0))
    
    expect_equal(as.vector(dfm_case_asis["d3", "team"]), 2)
    # note the overlap of Manchester United states in d3
    expect_equal(as.vector(dfm_case_asis["d3", "Countries"]), 0)
    
    tokens_case_ignore <- 
        tokens_lookup(toks, dict_mw_fixed, valuetype = "fixed", case_insensitive = TRUE)
    dfm_case_ignore <- dfm(tokens_case_ignore, tolower = FALSE)
    expect_equal(as.vector(dfm_case_ignore[, "Countries"]), c(1, 1, 1, 1, 0, 1))
    
    expect_equal(as.vector(dfm_case_ignore["d3", "team"]), 2)
    # note the overlap of Manchester United states in d3
    expect_equal(as.vector(dfm_case_ignore["d3", "Countries"]), 1)
    
    dict_mw_glob <- dictionary(list(Countries = c("United States", "Federal * of *"),
                                    oceans = c("* Ocean"),
                                    Institutions = c("federal gover*", "Supreme Court"),
                                    team = c("Manchester *", "Arsenal")), tolower = FALSE)
    tokens_case_asis_glob <- 
        tokens_lookup(toks, dict_mw_glob, valuetype = "glob", case_insensitive = FALSE)
    
    dfm_case_asis_glob <- dfm(tokens_case_asis_glob, tolower = FALSE)
    expect_equal(as.vector(dfm_case_asis_glob[, "Countries"]), c(1, 1, 0, 1, 0, 0))
    expect_equal(as.vector(dfm_case_asis_glob[, "oceans"]), c(2, 0, 0, 0, 0, 0))
    expect_equal(as.vector(dfm_case_asis_glob[, "team"]), c(0, 0, 2, 1, 0, 0))
    
})

test_that("entirely single-word dictionary keys are counted correctly", {
    
    dict_sw_fixed <- dictionary(list(Countries = c("States", "Germany"),
                                     oceans = c("Atlantic", "Pacific"),
                                     Institutions = c("government", "Court"),
                                     team = c("Manchester", "Arsenal")), tolower = FALSE)    
    
    tokens_case_asis <- 
        tokens_lookup(toks, dict_sw_fixed, valuetype = "fixed", case_insensitive = FALSE)
    dfm_case_asis <- dfm(tokens_case_asis, tolower = FALSE)
    expect_equal(as.vector(dfm_case_asis[, "Countries"]), c(1, 1, 0, 1, 0, 0))
    expect_equal(as.vector(dfm_case_asis[, "team"]), c(0, 0, 2, 1, 0, 0))
    
    expect_equal(as.vector(dfm_case_asis["d3", "team"]), 2)
    # note the overlap of Manchester United states in d3
    expect_equal(as.vector(dfm_case_asis["d3", "Countries"]), 0)
    
    tokens_case_ignore <- 
        tokens_lookup(toks, dict_sw_fixed, valuetype = "fixed", case_insensitive = TRUE)
    dfm_case_ignore <- dfm(tokens_case_ignore, tolower = FALSE)
    expect_equal(as.vector(dfm_case_ignore[, "Countries"]), c(1, 1, 1, 1, 0, 1))
    
    expect_equal(as.vector(dfm_case_ignore["d3", "team"]), 2)
    expect_equal(as.vector(dfm_case_ignore["d3", "Countries"]), 1)
    
})


test_that("multi-word dictionary behavior is not sensitive to the order of dictionary entries", {
    
    txt <- c(d1 = "The United States is a country.", 
             d2 = "Arsenal v Manchester United, states the announcer.")
    toks <- tokens(txt, remove_punct = TRUE)
    dict1 <- dictionary(list(Countries = c("United States"),
                             team = c("Manchester United", "Arsenal")))
    dict2 <- dictionary(list(team = c("Arsenal", "Manchester United"),
                             Countries = c("United States")))
    expect_equal(
        as.list(tokens_lookup(toks, dictionary = dict1, valuetype = "fixed")),
        as.list(tokens_lookup(toks, dictionary = dict2, valuetype = "fixed"))
    )
    
})

test_that("#388 issue about overlapping key values is resolved: fixed matches", {
    txt <- c(d1 = "The United States is bordered by the Atlantic Ocean and the Pacific Ocean.",
             d2 = "The Supreme Court of the United States is seldom in a united state.")
    toks <- tokens(txt)
    dict_fixed <- dictionary(list(Countries = c("States"),
                                  oceans = c("Atlantic", "Pacific"),
                                  gameconsoles = c("Xbox", "Nintendo"),
                                  swords = c("States")))
    
    expect_equal(as.list(tokens_lookup(toks, dict_fixed, valuetype = "fixed")),
                 list(d1 = c("Countries", "swords", "oceans", "oceans"),
                      d2 = c("Countries", "swords")))
})

test_that("#388 issue about overlapping key values is resolved: glob matches", {
    txt <- c(d1 = "The United States is bordered by the Atlantic Ocean and the Pacific Ocean.",
             d2 = "The Supreme Court of the United States is seldom in a united state.")
    toks <- tokens(txt)
    dict_glob <- dictionary(list(Countries = c("Stat*"),
                                 oceans = c("*ic"),
                                 gameconsoles = c("?box", "Nintendo*"),
                                 swords = "*s"), tolower = FALSE)
    
    expect_equal(as.list(tokens_lookup(toks, dict_glob, valuetype = "glob")),
                 list(d1 = c("Countries", "swords", "swords", "oceans", "oceans"),
                      d2 = c("Countries", "swords", "swords", "Countries")))
    expect_equal(as.list(tokens_lookup(toks, dict_glob, valuetype = "glob", case_insensitive = FALSE)),
                 list(d1 = c("Countries", "swords", "swords", "oceans", "oceans"),
                      d2 = c("Countries", "swords", "swords")))
})

test_that("#388 issue about overlapping key values is resolved: regex matches", {
    txt <- c(d1 = "The United States is bordered by the Atlantic Ocean and the Pacific Ocean.",
             d2 = "The Supreme Court of the United States is seldom in a united state.")
    toks <- tokens(txt)
    dict_regex <- dictionary(list(Countries = c("Stat.*$"),
                                  oceans = c("[A-Z][a-z]+ic"),
                                  gameconsoles = c("Xbox"),
                                  swords = "s$"), tolower = FALSE)
    
    expect_equal(as.list(tokens_lookup(toks, dict_regex, valuetype = "regex")),
                 list(d1 = c("Countries", "swords", "swords", "oceans", "oceans"),
                      d2 = c("Countries", "swords", "swords", "Countries")))
    expect_equal(as.list(tokens_lookup(toks, dict_regex, valuetype = "regex", case_insensitive = FALSE)),
                 list(d1 = c("Countries", "swords", "swords", "oceans", "oceans"),
                      d2 = c("Countries", "swords", "swords")))
    
})

test_that("non-exclusive lookup is working",{
    
    toks <- tokens(c(d1 = "Mexico signed a new libertarian law with Canada.",
                     d2 = "Let freedom ring in the United States!",
                     d3 = "Aliens are invading Mars"),
                   remove_punct = TRUE)
    toks2 <- tokens_remove(toks, stopwords("en"), padding = TRUE)
    
    dict <- dictionary(list(country = c("united states", "mexico", "canada"), 
                            "law words" = c('law*', 'constitution'), 
                            freedom = c('free', "freedom", 'libertarian'),
                            overlap = "United"))
    
    expect_equal(as.list(tokens_lookup(toks, dict, exclusive = FALSE, capkeys = TRUE)),
                 list(d1 = c("COUNTRY", "signed", "a", "new", "FREEDOM", "LAW WORDS", "with", "COUNTRY"),
                      d2 = c("Let", "FREEDOM", "ring", "in", "the", "COUNTRY", "OVERLAP"),
                      d3 = c("Aliens", "are", "invading", "Mars")))
    
    expect_equal(as.list(tokens_lookup(toks2, dict, exclusive = FALSE, capkeys = TRUE)),
                 list(d1 = c("COUNTRY", "signed", "", "new", "FREEDOM", "LAW WORDS", "", "COUNTRY"),
                      d2 = c("Let", "FREEDOM", "ring", "", "", "COUNTRY", "OVERLAP"),
                      d3 = c("Aliens", "", "invading", "Mars")))
    
})

test_that("tokens_lookup preserves case on keys", {
    ## issue #393
    toks <- tokens(data_corpus_inaugural[1:5])
    dict <- dictionary(list(Country = "united states",
                            HOR = c("House of Re*")))
    expect_identical(featnames(dfm(tokens_lookup(toks, dict), tolower = FALSE)),
                     c("Country", "HOR"))
})

test_that("multi-word dictionary behavior is not affected by padding", {
    
    toks <- tokens(c(d1 = "Mexico signed a new libertarian law with Canada.",
                     d2 = "Let freedom ring in the United States!"),
                   remove_punct = TRUE)
    toks <- tokens(txt, remove_punct = TRUE)
    toks2 <- tokens_remove(toks, stopwords('english'), padding = TRUE)
    dict <- dictionary(list(country = c("united states", "mexico", "canada"), 
                            "law words" = c('law*', 'constitution'), 
                            freedom = c('free', "freedom", 'libertarian'),
                            overlap = "United"))
    expect_equal(
        as.list(tokens_lookup(toks, dictionary = dict, valuetype = "fixed")),
        as.list(tokens_lookup(toks2, dictionary = dict, valuetype = "fixed"))
    )
    
})


test_that("#459 apply a hierarchical dictionary", {
    
    txt <- c(d1 = "The United States is bordered by the Atlantic Ocean and the Pacific Ocean.",
             d2 = "The Supreme Court of the United States is seldom in a united state.")
    toks <- tokens(txt)
    dict <- dictionary(list('geo'=list(
        Countries = c("States"),
        oceans = c("Atlantic", "Pacific")),
        'other'=list(
            gameconsoles = c("Xbox", "Nintendo"),
            swords = c("States"))), tolower = FALSE)
    
    expect_equal(as.list(tokens_lookup(toks, dict, valuetype = "fixed", levels=1)),
                 list(d1 = c("geo", "other", "geo", "geo"),
                      d2 = c("geo", "other")))
    
    expect_equal(as.list(tokens_lookup(toks, dict, valuetype = "fixed", levels=1:2)),
                 list(d1 = c("geo.Countries", "other.swords", "geo.oceans", "geo.oceans"),
                      d2 = c("geo.Countries", "other.swords")))
    
    expect_equal(as.list(tokens_lookup(toks, dict, valuetype = "fixed", levels=2)),
                 list(d1 = c("Countries", "swords", "oceans", "oceans"),
                      d2 = c("Countries", "swords")))
})

test_that("#459 extract the lower levels of a dictionary using tokens_lookup", {
    txt <- c(d1 = "The United States has the Atlantic Ocean and the Pacific Ocean.",
             d2 = "Britain and Ireland have the Irish Sea and the English Channel.")
    toks <- tokens(txt)
    dict <- dictionary(list('US' = list(
                                    Countries = c("States"),
                                    oceans = c("Atlantic", "Pacific")),
                            'Europe' = list(
                                Countries = c("Britain", "Ireland"),
                                oceans = list(west = "Irish Sea", 
                                              east = "English Channel"))))

    expect_equal(as.list(tokens_lookup(toks, dict, levels = 1)),
                 list(d1 = c('US', 'US', 'US'), 
                      d2 = c('Europe', 'Europe', 'Europe', 'Europe')))
    expect_equal(as.list(tokens_lookup(toks, dict, levels = 2)),
                 list(d1 = c('Countries', 'oceans', 'oceans'), 
                      d2 = c('Countries', 'Countries', 'oceans', 'oceans')))
    expect_equal(as.list(tokens_lookup(toks, dict, levels = 1:2)),
                 list(d1 = c('US.Countries', 'US.oceans', 'US.oceans'), 
                      d2 = c('Europe.Countries', 'Europe.Countries', 'Europe.oceans', 'Europe.oceans')))
    expect_equal(as.list(tokens_lookup(toks, dict, levels = 3)),
                 list(d1 = character(), 
                      d2 = c('west', 'east')))
    expect_equal(as.list(tokens_lookup(toks, dict, levels = c(1,3))),
                 list(d1 = c('US', 'US', 'US'), 
                      d2 = c('Europe', 'Europe', 'Europe.west', 'Europe.east')))
    expect_equal(as.list(tokens_lookup(toks, dict, levels = c(2,3))),
                 list(d1 = c('Countries', 'oceans', 'oceans'), 
                      d2 = c('Countries', 'Countries', 'oceans.west', 'oceans.east')))
    expect_equal(as.list(tokens_lookup(toks, dict, levels = c(1, 4))),
                 list(d1 = c('US', 'US', 'US'), 
                      d2 = c('Europe', 'Europe', 'Europe', 'Europe')))
    expect_equal(as.list(tokens_lookup(toks, dict, levels = 4)),
                 list(d1 = character(), 
                      d2 = character()))
})

test_that("#480 reset padding flag", {
    
    toks <- tokens(data_corpus_inaugural[1:5])
    toks <- tokens_remove(toks, stopwords('english'), padding = TRUE)
    dict <- dictionary(list(Country = "united states",
                            HOR = c("House of Re*")))
    expect_false('' %in% featnames(dfm(tokens_lookup(toks, dict, exclusive = TRUE), tolower = FALSE)))
})


test_that("#500 tokens_lookup separates entry words by separator", {
    
    toks <- tokens(data_corpus_inaugural[1:5])
    dict <- dictionary(list(Country = "united_states",
                            HOR = c("House_of_Re*")), separator = '_')
    expect_identical(featnames(dfm(tokens_lookup(toks, dict), tolower = FALSE)),
                     c("Country", "HOR"))
})


# test_that("#500 tokens_lookup do not separate words when multiword = FALSE", {
#     toks <- as.tokens(list(d1 = c('United States', 'Atlantic Ocean', 'Pacific Ocean'),
#                            d2 = c('Supreme Court', 'United States')))
#     dict <- dictionary(list(Countries = c("United States"),
#                             oceans = c("Atlantic *", "Pacific *")))
#     
#     expect_equal(as.list(tokens_lookup(toks, dict, valuetype = "glob", multiword = FALSE)),
#                  list(d1 = c("Countries", "oceans", "oceans"),
#                       d2 = c("Countries")))
# })

test_that("#500 tokens_lookup substitute concatenator", {
    toks <- as.tokens(list(d1 = c('United-States', 'Atlantic-Ocean', 'Pacific-Ocean'),
                           d2 = c('Supreme-Court', 'United-States')), concatenator = '-')
    dict <- dictionary(list(Countries = c("United_States"),
                            oceans = c("Atlantic_*", "Pacific_*")), separator = '_')
    
    expect_equal(as.list(tokens_lookup(toks, dict, valuetype = "glob")),
                 list(d1 = c("Countries", "oceans", "oceans"),
                      d2 = c("Countries")))
})


test_that("#502 tokens_lookup count overlapped words", {
    
    txt <- c(d1 = "The United States of America is bordered by the Atlantic Ocean and the Pacific Ocean.",
             d2 = "The Supreme Court of the United States is seldom in a united state.")
    toks <- tokens(txt)
    dict <- dictionary(list(Countries = c("United States", "United States of America"),
                            oceans = c("Ocean")))
    expect_equal(as.list(tokens_lookup(toks, dict, valuetype = "glob")),
                 list(d1 = c("Countries", "oceans", "oceans"),
                      d2 = c("Countries")))
    expect_equal(as.list(tokens_lookup(toks, dict, valuetype = "fixed")),
                 list(d1 = c("Countries", "oceans", "oceans"),
                      d2 = c("Countries")))
    
    dict <- dictionary(list(Countries = c("United States", "Unit Stat of America"),
                            oceans = c("Ocean.*")))
    expect_equal(as.list(tokens_lookup(toks, dict, valuetype = "regex")),
                 list(d1 = c("Countries", "oceans", "oceans"),
                      d2 = c("Countries")))
    
    expect_equal(
        as.character(tokens_lookup(tokens("A B C"), dictionary(list(key = c("A B", "A B C"))))),
        c("key")
    )
    
    expect_equal(
        as.character(tokens_lookup(tokens("A B C"), dictionary(list(key = c("B C", "A B C"))))),
        c("key")
    )
})



test_that("tokens_lookup with nomatch works", {
    txts <- c(d1 = "a c d d", d2 = "a a b c c c e f")
    toks <- tokens(txts)
    dict <- dictionary(list(one = c("a", "b", "b c"), two = c("e", "f")))
    
    expect_equal(
        as.matrix(dfm(tokens_lookup(toks, dict))),
        as.matrix(dfm(tokens_lookup(toks, dict, nomatch = "_unmatched")))[, 1:2]
    )
    
    expect_equivalent(
        as.matrix(cbind("_unmatched" = ntoken(tokens_remove(toks, dict)))),
        as.matrix(dfm(tokens_lookup(toks, dict, nomatch = "_unmatched")))[, "_unmatched", drop = FALSE]
    )
    
    expect_equal(
        as.matrix(dfm(tokens_lookup(toks, dict, nomatch = "_unmatched"))),
        matrix(c(1,3,0,2,3,2), nrow = 2, dimnames = list(docs = c("d1", "d2"), features = c("one", "two", "_unmatched")))
    )
    expect_warning(
        tokens_lookup(toks, dict, nomatch = "ANYTHING", exclusive = FALSE),
        "nomatch only applies if exclusive = TRUE"
    )
})

test_that("tokens_lookup works with exclusive = TRUE, #958", {
    
    txt <- c("word word2 document documents documenting",
                 "use using word word2")
    dict <- dictionary(list(
        document = "document*",
        use      = c("use", "using")
    ))
    
    toks <- tokens(txt)
    expect_equal(
        as.list(tokens_lookup(toks, dict, exclusive = FALSE, capkeys = FALSE)),
        list(text1 = c('word', 'word2', 'document', 'document', 'document'), 
             text2 = c('use', 'use', 'word', 'word2'))
    )
    expect_equal(
        as.list(tokens_lookup(toks, dict, exclusive = FALSE, capkeys = TRUE)),
        list(text1 = c('word', 'word2', 'DOCUMENT', 'DOCUMENT', 'DOCUMENT'), 
             text2 = c('USE', 'USE', 'word', 'word2'))
    )
})

test_that("tokens_lookup works in exclusive = TRUE and FALSE, #970", {
    
    toks <- tokens("say good bye to Hollywood")
    dict <- dictionary(list(pos = "good", farewell = "good bye"))
    
    toks_ex <- tokens_lookup(toks, dict, exclusive = TRUE)
    expect_true(attr(toks_ex, "meta")$object$what == "dictionary")
    expect_equal(as.list(toks_ex),
                 list(text1 = c("pos", "farewell")))
    
    toks_ne <- tokens_lookup(toks, dict, exclusive = FALSE)
    expect_true(attr(toks, "meta")$object$what == "word")
    expect_equal(as.list(toks_ne),
                 list(text1 = c("say", "POS", "FAREWELL", "to", "Hollywood")))

})

test_that("tokens_lookup works when exclusive = FALSE, #970", {
    
    dict <- dictionary(list(sequence1 = "a b", sequence2 = "x y", notseq = c("d", "e")))
    txt <- c(d1 = "a b c d e f g x y z",
             d2 = "a c d x z",
             d3 = "x y",
             d4 = "f g")
    toks <- tokens(txt)
    expect_equal(as.list(tokens_lookup(toks, dict, exclusive = FALSE)),
                 list(d1 = c("SEQUENCE1", "c", "NOTSEQ", "NOTSEQ", "f", "g", "SEQUENCE2", "z"), 
                      d2 = c("a","c", "NOTSEQ", "x", "z"),
                      d3 = c("SEQUENCE2"),
                      d4 = c("f", "g"))
                 )
    
})

test_that("tokens_lookup works when there is a key with non-existent values and when exclusive = FALSE, #1011", {
    
    dict <- dictionary(list(sequence1 = "a b", sequence2 = "x y", notseq = c("d", "e"), notexist = c("zzz")))
    txt <- c(d1 = "a b c d e f g x y z",
             d2 = "a c d x z",
             d3 = "x y",
             d4 = "f g")
    toks <- tokens(txt)
    expect_equal(as.list(tokens_lookup(toks, dict, exclusive = FALSE)),
                 list(d1 = c("SEQUENCE1", "c", "NOTSEQ", "NOTSEQ", "f", "g", "SEQUENCE2", "z"), 
                      d2 = c("a","c", "NOTSEQ", "x", "z"),
                      d3 = c("SEQUENCE2"),
                      d4 = c("f", "g"))
    )
    
})

test_that("tokens_lookup with nomatch works with key that do not appear in the text, #1347", {
    txt <- c("12032 Musgrave rd red hill", 
             "13 rad street windermore park queensland",
             "130 right road", 
             "130 rtn road")
    toks <- tokens(txt)
    dict <- dictionary(list(CR = c("rd", "red"), 
                            CB = c("street", "feet"), 
                            CA = c("parl", "dark"))) # CA does not appear at all
    toks_dict <- tokens_lookup(toks, dict, nomatch = "NONE")
    expect_identical(as.list(toks_dict), 
                      list(text1 = c("NONE", "NONE", "CR", "CR", "NONE"), 
                           text2 = c("NONE", "NONE", "CB", "NONE", "NONE", "NONE"), 
                           text3 = c("NONE", "NONE", "NONE"),
                           text4 = c("NONE", "NONE", "NONE")))
    expect_equivalent(unclass(toks_dict), 
                      list(c(4, 4, 1, 1, 4), c(4, 4, 2, 4, 4, 4), c(4, 4, 4), c(4, 4, 4)))
    expect_identical(types(toks_dict), c("CR", "CB", "CA", "NONE"))
})

test_that("nested_scope function is working", { 
    dict <- dictionary(list(
        'AS' = c("American Samoa", "American Samoan*", "Pago Pago"),
        'WS' = c("Samoa", "Samoan*", "Apia"),
        'VG' = c("British Virgin Islands", "Virgin Island*", "Road Town"),
        'GB' = c("UK", "United Kingdom", "Britain", "British", "Briton*", "Brit*", "London"),
        'US' = c("United States", "US", "American*", "Washington", "New York")
    ))
    txt <- c(
        'British Virgin Islands is a British overseas territory',
        'Samoa is an independent state',
        'American Samoa is in the South Pacific'
    )
    toks <- tokens(txt)
    expect_equal(
        as.list(tokens_lookup(toks, dict, nested_scope = "key")),
        list(text1 = c("VG", "GB", "GB"), 
             text2 = c("WS"), 
             text3 = c("AS", "US", "WS"))
    )
    expect_equal(
        as.list(tokens_lookup(toks, dict, nested_scope = "dictionary")),
        list(text1 = c("VG", "GB"), 
             text2 = c("WS"), 
             text3 = c("AS"))
    )
})

test_that("dictionary nested_scope is independent of orders", {
    toks1 <- tokens("Virgin Islands are near Dominica and the Dominican Republic")
    dict1 <- dictionary(list("VG" = "Virgin Islands",
                            "VI" = "Virgin Islands",
                            "DM" = "Dominica*",
                            "DO" = "Dominican Republic"))
    expect_equal(
        as.list(tokens_lookup(toks1, dict1, nested_scope = "dictionary")),
        list(text1 = c("VG", "VI", "DM", "DO"))
    )
    expect_equal(
        as.list(tokens_lookup(toks1, rev(dict1), nested_scope = "dictionary")),
        list(text1 = c("VI", "VG", "DM", "DO"))
    )
    
    toks2 <- tokens("Congolese are people in Republic of Congo or Democratic Republic of Congo")
    dict2 <- dictionary(list("CD" = c("Democratic Republic of Congo", "Congolese"),
                             "CG" = c("Republic of Congo", "Congolese")))
    expect_equal(
        as.list(tokens_lookup(toks2, dict2, nested_scope = "dictionary")),
        list(text1 = c("CD", "CG", "CG", "CD"))
    )
    expect_equal(
        as.list(tokens_lookup(toks2, rev(dict2), nested_scope = "dictionary")),
        list(text1 = c("CG", "CD", "CG", "CD"))
    )
})

test_that("tokens_lookup return tokens even if no matches", {
    dict <- dictionary(list("en" = list("foreign policy" = "aaaaa", 
                                        "domestic politics" = "bbbbb")))
    toks <- tokens(data_corpus_inaugural[1:5])
    expect_identical(
        types(tokens_lookup(toks, dict)),
        c("en.foreign policy", "en.domestic politics")
    )
    expect_identical(
        lengths(tokens_lookup(toks, dict)),
        c("1789-Washington" = 0L, "1793-Washington" = 0L, "1797-Adams" = 0L,
          "1801-Jefferson"  = 0L, "1805-Jefferson" = 0L)
    )
})


test_that("append_key is working",{
    
    toks <- tokens(c(d1 = "Mexico signed a new libertarian law with Canada.",
                     d2 = "Let freedom ring in the United States!",
                     d3 = "Aliens are invading Mars"),
                   remove_punct = TRUE)
    toks_pad <- tokens_remove(toks, stopwords("en"), padding = TRUE)
    
    dict <- dictionary(list(country = c("united states", "mexico", "canada"), 
                            "law words" = c('law*', 'constitution'), 
                            freedom = c('free', "freedom", 'libertarian'),
                            overlap = "United"))
    
    # exclusive mode
    toks_ex1 <- tokens_lookup(toks, dict, exclusive = TRUE, capkeys = TRUE, append_key = TRUE)
    expect_equal(as.list(toks_ex1),
                 list(d1 = c("Mexico/COUNTRY", "libertarian/FREEDOM", 
                             "law/LAW WORDS", "Canada/COUNTRY"),
                      d2 = c("freedom/FREEDOM", 
                             "United_States/COUNTRY", "United/OVERLAP"),
                      d3 = character()))
    expect_true(all(featfreq(dfm(toks_ex1)) > 0))
    
    toks_ex2 <- tokens_lookup(toks_pad, dict, exclusive = TRUE, capkeys = TRUE, append_key = TRUE)
    expect_equal(as.list(toks_ex2),
                 list(d1 = c("Mexico/COUNTRY", "libertarian/FREEDOM", 
                             "law/LAW WORDS", "Canada/COUNTRY"),
                      d2 = c("freedom/FREEDOM", 
                             "United_States/COUNTRY", "United/OVERLAP"),
                      d3 = character()))
    expect_true(all(featfreq(dfm(toks_ex2)) > 0))
    
    toks_ex3 <- tokens_lookup(toks_pad, dict, exclusive = TRUE, capkeys = TRUE,
                             append_key = TRUE, concatenator = "+")
    expect_equal(as.list(toks_ex3),
                 list(d1 = c("Mexico/COUNTRY", "libertarian/FREEDOM", 
                             "law/LAW WORDS", "Canada/COUNTRY"),
                      d2 = c("freedom/FREEDOM", 
                             "United+States/COUNTRY", "United/OVERLAP"),
                      d3 = character()))
    expect_true(all(featfreq(dfm(toks_ex3)) > 0))
    
    # non-exclusive mode
    toks_ne1 <- tokens_lookup(toks, dict, exclusive = FALSE, capkeys = TRUE,
                              append_key = TRUE)
    expect_equal(as.list(toks_ne1),
                 list(d1 = c("Mexico/COUNTRY", "signed", "a", "new", "libertarian/FREEDOM", 
                             "law/LAW WORDS", "with", "Canada/COUNTRY"),
                      d2 = c("Let", "freedom/FREEDOM", "ring", "in", "the", 
                             "United_States/COUNTRY", "United/OVERLAP"),
                      d3 = c("Aliens", "are", "invading", "Mars")))
    expect_true(all(featfreq(dfm(toks_ne1)) > 0))
    
    toks_ne2 <- tokens_lookup(toks, dict, exclusive = FALSE, capkeys = TRUE,
                              append_key = TRUE, separator = "+")
    expect_equal(as.list(toks_ne2),
                 list(d1 = c("Mexico+COUNTRY", "signed", "a", "new", "libertarian+FREEDOM", 
                             "law+LAW WORDS", "with", "Canada+COUNTRY"),
                      d2 = c("Let", "freedom+FREEDOM", "ring", "in", "the", 
                             "United_States+COUNTRY", "United+OVERLAP"),
                      d3 = c("Aliens", "are", "invading", "Mars")))
    expect_true(all(featfreq(dfm(toks_ne2)) > 0))
    
    toks_ne3 <- tokens_lookup(toks, dict, exclusive = FALSE, capkeys = TRUE, 
                              append_key = TRUE, separator = " x ")
    expect_equal(as.list(toks_ne3),
                 list(d1 = c("Mexico x COUNTRY", "signed", "a", "new", "libertarian x FREEDOM", 
                             "law x LAW WORDS", "with", "Canada x COUNTRY"),
                      d2 = c("Let", "freedom x FREEDOM", "ring", "in", "the", 
                             "United_States x COUNTRY", "United x OVERLAP"),
                      d3 = c("Aliens", "are", "invading", "Mars")))
    expect_true(all(featfreq(dfm(toks_ne3)) > 0))
    
    toks_ne4 <- tokens_lookup(toks_pad, dict, exclusive = FALSE, capkeys = FALSE, 
                              append_key = TRUE)
    expect_equal(as.list(toks_ne4),
                 list(d1 = c("Mexico/country", "signed", "", "new", "libertarian/freedom", 
                             "law/law words", "", "Canada/country"),
                      d2 = c("Let", "freedom/freedom", "ring", "", "", 
                             "United_States/country", "United/overlap"),
                      d3 = c("Aliens", "", "invading", "Mars")))
    expect_true(all(featfreq(dfm(toks_ne4)) > 0))
    
    toks_ne5 <- tokens_lookup(toks_pad, dict, exclusive = FALSE, capkeys = FALSE, 
                              append_key = TRUE, separator = "➡️")
    expect_equal(as.list(toks_ne5),
                 list(d1 = c("Mexico➡️country", "signed", "", "new", "libertarian➡️freedom", 
                             "law➡️law words", "", "Canada➡️country"),
                      d2 = c("Let", "freedom➡️freedom", "ring", "", "", 
                             "United_States➡️country", "United➡️overlap"),
                      d3 = c("Aliens", "", "invading", "Mars")))
    expect_true(all(featfreq(dfm(toks_ne5)) > 0))
    
    expect_error(
        tokens_lookup(toks, dict, exclusive = FALSE,  append_key = NA),
        "append_key cannot be NA"
    )
    expect_error(
        tokens_lookup(toks, dict, exclusive = FALSE, append_key = TRUE, 
                      separator = c("+", "+")),
        "The length of separator must be 1"
    )
    expect_error(
        tokens_lookup(toks, dict, exclusive = FALSE, append_key = TRUE, 
                      concatenator = c("_", "_")),
        "The length of concatenator must be 1"
    )
    
})

test_that("apply_if argument is working", {
    
    dat <- data.frame(text = c("R and C are languages",
                               "Windows (R), Quanteda (C)"),
                      topic = c("language", "software"))
    dict <- dictionary(list(language = c("C", "R")))
    corp <- corpus(dat)
    toks <- tokens(corp, remove_punct = TRUE) %>% 
        tokens_remove(stopwords())
    
    toks1 <- tokens_lookup(toks, dict)
    expect_identical(
        as.list(toks1),
        list(text1 = c("language", "language"),
             text2 = c("language", "language"))
    )
    
    toks2 <- tokens_lookup(toks, dict, apply_if = toks$topic == "language")
    expect_identical(
        as.list(toks2),
        list(text1 = c("language", "language"),
             text2 = character())
    )
    
    toks3 <- tokens_lookup(toks, dict, exclusive = FALSE, append_key = TRUE)
    expect_identical(
        as.list(toks3),
        list(text1 = c("R/LANGUAGE", "C/LANGUAGE", "languages"),
             text2 = c("Windows", "R/LANGUAGE" , "Quanteda", "C/LANGUAGE"))
    )
})
quanteda/quanteda documentation built on April 15, 2024, 7:59 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
quanteda/quanteda
Quantitative Analysis of Textual Data

tests/testthat/test-tokens_lookup.R
In quanteda/quanteda: Quantitative Analysis of Textual Data

R Package Documentation

Browse R Packages

We want your feedback!

quanteda/quanteda Quantitative Analysis of Textual Data

tests/testthat/test-tokens_lookup.R In quanteda/quanteda: Quantitative Analysis of Textual Data

R Package Documentation

Browse R Packages

We want your feedback!

quanteda/quanteda
Quantitative Analysis of Textual Data

tests/testthat/test-tokens_lookup.R
In quanteda/quanteda: Quantitative Analysis of Textual Data