quanteda: Quantitative Analysis of Textual Data

test_that("tokens_compound join tokens correctly", {

    txt <- c("a b c d e f g", "A B C D E F G", "A b C d E f G",
             "aaa bbb ccc ddd eee fff ggg", "a_b b_c c_d d_e e_f f_g")
    toks <- tokens(txt)
    seqs <- as.list(tokens(c("a b", "C D", "aa* bb*", "eEE FFf", "d_e e_f"),
                    what = "fastestword"))
    expect_equivalent(
        as.list(tokens_compound(toks, seqs, valuetype = "glob", case_insensitive = TRUE)),
        list(c("a_b", "c_d", "e", "f", "g"),
             c("A_B", "C_D", "E", "F", "G"),
             c("A_b", "C_d", "E", "f", "G"),
             c("aaa_bbb", "ccc", "ddd", "eee_fff", "ggg"),
             c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
    )

    expect_equivalent(
        as.list(tokens_compound(toks, seqs, valuetype = "glob", case_insensitive = FALSE)),
        list(c("a_b", "c", "d", "e", "f", "g"),
             c("A", "B", "C_D", "E", "F", "G"),
             c("A", "b", "C", "d", "E", "f", "G"),
             c("aaa_bbb", "ccc", "ddd", "eee", "fff", "ggg"),
             c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
    )

    seqs_fixed <- as.list(tokens(c("a b", "C D", "aa bb", "eEE FFf", "d_e e_f"),
                          what = "fastestword"))
    expect_equivalent(
        as.list(tokens_compound(toks, seqs_fixed, valuetype = "glob", case_insensitive = TRUE)),
        list(c("a_b", "c_d", "e", "f", "g"),
             c("A_B", "C_D", "E", "F", "G"),
             c("A_b", "C_d", "E", "f", "G"),
             c("aaa", "bbb", "ccc", "ddd", "eee_fff", "ggg"),
             c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
    )

    expect_equivalent(
        as.list(tokens_compound(toks, seqs_fixed, valuetype = "glob", case_insensitive = FALSE)),
        list(c("a_b", "c", "d", "e", "f", "g"),
             c("A", "B", "C_D", "E", "F", "G"),
             c("A", "b", "C", "d", "E", "f", "G"),
             c("aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg"),
             c("a_b", "b_c", "c_d", "d_e_e_f", "f_g"))
    )
  
})

test_that("tokens_compound join a sequences of sequences", {

    txt <- c("a b c d e f g", "A B C D E F G")
    toks <- tokens(txt)
    seqs <- as.list(tokens(c("a b", "b c d", "E F", "F G"),
                    what = "fastestword"))
    expect_equal(
        as.list(tokens_compound(toks, seqs, valuetype = "glob", case_insensitive = TRUE, join = TRUE)),
        list(text1 = c("a_b_c_d", "e_f_g"),
             text2 = c("A_B_C_D", "E_F_G"))
    )

    expect_equal(
        as.list(tokens_compound(toks, seqs, valuetype = "glob", case_insensitive = TRUE, join = FALSE)),
        list(text1 = c("a_b", "b_c_d", "e_f", "f_g"),
             text2 = c("A_B", "B_C_D", "E_F", "F_G"))
    )
    
    expect_error(
        tokens_compound(toks, seqs, join = c(TRUE, FALSE)),
        "The length of join must be 1"
    )
})

test_that("keep_unigrams works", {
    
    txt <- "we like high quality sound"
    toks <- tokens(txt)
    
    # overlapped
    pat <- phrase(c("high quality", "quality sound"))
    expect_equal(as.list(tokens_compound(toks, pat, join = TRUE)),
                 list(text1 = c("we", "like", "high_quality_sound")))
    expect_equal(as.list(tokens_compound(toks, pat, join = FALSE)),
                 list(text1 = c("we", "like", "high_quality", "quality_sound")))
    expect_equal(as.list(tokens_compound(toks, pat, join = TRUE, keep_unigrams = TRUE)),
                 list(text1 = c("we", "like", "high", "quality", "sound",
                                "high_quality_sound")))
    expect_equal(as.list(tokens_compound(toks, pat, join = FALSE, keep_unigrams = TRUE)),
                 list(text1 = c("we", "like", "high", "quality", "high_quality",
                                "sound", "quality_sound")))
    
    # nested
    pat2 <- phrase(c("high quality", "quality sound", "high quality sound"))
    expect_equal(as.list(tokens_compound(toks, pat2, join = TRUE)),
                 list(text1 = c("we", "like", "high_quality_sound")))
    expect_equal(as.list(tokens_compound(toks, pat2, join = FALSE)),
                 list(text1 = c("we", "like", "high_quality_sound")))
    expect_equal(as.list(tokens_compound(toks, pat2, join = TRUE, keep_unigrams = TRUE)),
                 list(text1 = c("we", "like", "high", "quality", "sound",
                                "high_quality_sound")))
    expect_equal(as.list(tokens_compound(toks, pat2, join = FALSE, keep_unigrams = TRUE)),
                 list(text1 = c("we", "like", "high", "quality", "sound",
                                "high_quality_sound")))
    
    expect_error(
        tokens_compound(toks, seqs, keep_unigrams = c(TRUE, FALSE)),
        "The length of keep_unigrams must be 1"
    )

})

test_that("tokens_compound is not affected by the order of compounds", {
    expect_equal(
        as.list(tokens_compound(tokens("The people of the United States of America."),
                                c("United States of America", "United States"))),
        as.list(tokens_compound(tokens("The people of the United States of America."),
                                c("United States", "United States of America")))
    )
})

test_that("tokens_compound preserved document names", {
    expect_equal(
        names(tokens_compound(tokens(c(d1 = "The people of the United States of America.",
                                       d2 = "The United States is south of Canada.")),
                              c("United States", "United States of America"))),
        c("d1", "d2")
    )
})

test_that("tokens_compound works with padded tokens", {
    toks <- tokens(c(doc1 = "a b c d e f g"))
    toks <- tokens_remove(toks, c("b", "e"), padding = TRUE)
    toks <- tokens_compound(toks, phrase("c d"))
    expect_equal(sort(attr(toks, "types")),
                 sort(c("a", "c_d", "f", "g")))
})

test_that("tokens_compound works with different concatenators", {
  toks <- tokens(c(doc1 = "a b c d e f g"))
  
  toks1 <- tokens_compound(toks, phrase("c d"), concatenator = "+")
  expect_equal(sort(attr(toks1, "types")),
               sort(c("a", "b", "c+d", "e", "f", "g")))
  expect_equal(meta(toks1, field = "concatenator", type = "object"), "+")
  
  toks2 <- tokens_compound(toks, phrase("c d"), concatenator = "&&")
  expect_equal(meta(toks2, field = "concatenator", type = "object"), "&&")
  expect_equal(sort(attr(toks2, "types")),
               sort(c("a", "b", "c&&d", "e", "f", "g")))
  
  toks3 <- tokens_compound(toks, phrase("c d"), concatenator = "")
  expect_equal(meta(toks3, field = "concatenator", type = "object"), "")
  expect_equal(sort(attr(toks3, "types")),
               sort(c("a", "b", "cd", "e", "f", "g")))
  expect_error(tokens_compound(toks, phrase("c d"), concatenator = character()),
               "The length of concatenator must be 1")
  
  # update concatenator even without matches
  toks4 <- tokens_compound(toks, phrase("xxxx yyy"), concatenator = "++")
  expect_equal(sort(attr(toks4, "types")),
               sort(c("a", "b", "c", "d", "e", "f", "g")))
  expect_equal(meta(toks4, field = "concatenator", type = "object"), "++")
})

test_that("tokens_compound works with nested tokens", {
    
    toks <- tokens("a b c d")
    pat <- phrase(c("a b", "b c", "a b c"))
    expect_equal(
        as.character(tokens_compound(toks, pat, join = FALSE)),
        c("a_b_c", "d")
    )
    expect_equal(
        as.character(tokens_compound(toks, pat, join = TRUE)),
        c("a_b_c", "d")
    )
})

test_that("tokens_compound works with nested and overlapping tokens", {
    
    toks <- tokens("a b c d e")
    pat <- phrase(c("a b", "a b c", "c d"))
    expect_equal(
        as.character(tokens_compound(toks, pat, join = FALSE)),
        c("a_b_c", "c_d", "e")
    )
    expect_equal(
        as.character(tokens_compound(toks, pat, join = TRUE)),
        c("a_b_c_d", "e")
    )
})

test_that("tokens_compound works with dictionaries", {
    dict <- dictionary(list(taxcgt = c("capital gains tax*"), taxit = "inheritance tax*"))
    toks <- tokens("The new law included capital gains taxes and inheritance taxes.")
    expect_equal(
        as.character(tokens_compound(toks, dict))[c(5, 7)],
        c("capital_gains_taxes", "inheritance_taxes")
    )
    expect_equal(
        tokens_compound(toks, dict),
        tokens_compound(toks, phrase(dict))
    )

    dict <- dictionary(list(tax1 = c("capital gains", "taxes"),
                            tax2 = "gains taxes"))
    expect_equal(
        as.character(tokens_compound(toks, dict, join = TRUE))[5],
        c("capital_gains_taxes")
    )
    expect_equal(
        as.character(tokens_compound(toks, dict, join = FALSE))[5:6],
        c("capital_gains", "gains_taxes")
    )
})

test_that("tokens_compound error when dfm is given, #1006", {
    toks <- tokens("a b c")
    expect_error(tokens_compound(toks, dfm(tokens("b c d"))))
})

test_that("tokens_compound window is working", {

  txt <- c("a b c d e f g")
  toks <- tokens(txt)
  pat <- phrase(c("a b", "c d", "g"))

  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE)),
    as.list(tokens(c("a_b c_d e f g")))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat, join = FALSE)),
    as.list(tokens(c("a_b c_d e f g")))
  )

  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE, window = 1)),
    as.list(tokens(c("a_b_c_d_e f_g")))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat, join = FALSE, window = 1)),
    as.list(tokens(c("a_b_c b_c_d_e f_g")))
  )

  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE, window = c(1, 0))),
    as.list(tokens(c("a_b_c_d e f_g")))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat, join = FALSE, window = c(1, 0))),
    as.list(tokens(c("a_b b_c_d e f_g")))
  )

  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE, window = c(0, 1))),
    as.list(tokens(c("a_b_c_d_e f g")))
  )
  expect_equal(
    as.list(tokens_compound(toks, pat, join = FALSE, window = c(0, 1))),
    as.list(tokens(c("a_b_c c_d_e f g")))
  )

  expect_equal(
    as.list(tokens_compound(toks, pat, join = TRUE, window = c(100, 100))),
    as.list(tokens(c("a_b_c_d_e_f_g")))
  )
  expect_error(
    tokens_compound(toks, pat, join = FALSE, window = -1),
    "The value of window must be between 0 and Inf"
  )
  expect_error(
    tokens_compound(toks, pat, join = FALSE, window = c(1, 1, 2)),
    "The length of window must be between 1 and 2"
  )
  expect_equal(
    as.list(tokens_compound(tokens_remove(toks, "a", padding = TRUE), pat, join = TRUE, window = 1)),
    list(text1 = c("", "b_c_d_e", "f_g"))
  )
  expect_equal(
    as.list(tokens_compound(tokens_remove(toks, "a", padding = TRUE), pat, join = FALSE, window = 1)),
    list(text1 = c("", "b_c_d_e", "f_g"))
  )
  toks_pad1 <- tokens_remove(toks, "e", padding = TRUE)
  expect_equal(
    as.list(tokens_compound(toks_pad1, pat, join = TRUE, window = 1)),
    list(text1 = c("a_b_c_d", "", "f_g"))
  )
  expect_equal(
    as.list(tokens_compound(toks_pad1, pat, join = FALSE, window = 1)),
    list(text1 = c("a_b_c", "b_c_d", "", "f_g"))
  )

  expect_equal(
    as.list(tokens_compound(toks_pad1, pat, join = TRUE, window = 2)),
    list(text1 = c("a_b_c_d", "", "f_g"))
  )
  expect_equal(
    as.list(tokens_compound(toks_pad1, pat, join = FALSE, window = 2)),
    list(text1 = c("a_b_c_d", "", "f_g"))
  )

  toks_pad2 <- tokens_remove(toks, c("a", "e"), padding = TRUE)
  expect_equal(
    as.list(tokens_compound(toks_pad2, pat, join = TRUE, window = 1)),
    list(text1 = c("", "b_c_d", "", "f_g"))
  )
  expect_equal(
    as.list(tokens_compound(toks_pad2, pat, join = FALSE, window = 1)),
    list(text1 = c("", "b_c_d", "", "f_g"))
  )

})

test_that("tokens_compound ignores padding", {
    
    toks <- tokens("a b c d e")
    toks <- tokens_remove(toks, "b", padding = TRUE)
    
    toks_comp1 <- tokens_compound(toks, list(c("", "c"), c("d", "e")))
    expect_equal(
        as.character(toks_comp1),
        c("a", "", "c", "d_e")
    )
    
    col <- data.frame(collocation = c(" c", "d e"))
    class(col) <- c("collocations", "textstat", "data.frame")
    toks_comp2 <- tokens_compound(toks, col)
    expect_equal(
        as.character(toks_comp2),
        c("a", "", "c", "d_e")
    )
})


test_that("apply_if argument is working", {
    
    dat <- data.frame(text = c("C++ is a language",
                               "+++ Section C +++"),
                      topic = c("language", "separator"))
    corp <- corpus(dat)
    toks <- tokens(corp)
    
    toks1 <- tokens_compound(toks, phrase("+ +"), concatenator = "",
                             apply_if = toks$topic == "language")
    expect_identical(
        as.list(toks1),
        list(text1 = c("C", "++", "is", "a", "language"),
             text2 = c("+", "+", "+", "Section", "C", "+", "+", "+"))
    )
    
    toks2 <- tokens_compound(toks, "c", window = c(0, 2), concatenator = "",
                             apply_if = toks$topic == "language") %>% 
             tokens_select(min_nchar = 3)
    expect_identical(
        as.list(toks2),
        list(text1 = c("C++", "language"),
             text2 = c("Section"))
    )

})
quanteda/quanteda documentation built on Sept. 4, 2024, 7:56 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
quanteda/quanteda
Quantitative Analysis of Textual Data

tests/testthat/test-tokens_compound.R
In quanteda/quanteda: Quantitative Analysis of Textual Data

R Package Documentation

Browse R Packages

We want your feedback!

quanteda/quanteda Quantitative Analysis of Textual Data

tests/testthat/test-tokens_compound.R In quanteda/quanteda: Quantitative Analysis of Textual Data

R Package Documentation

Browse R Packages

We want your feedback!

quanteda/quanteda
Quantitative Analysis of Textual Data

tests/testthat/test-tokens_compound.R
In quanteda/quanteda: Quantitative Analysis of Textual Data