# tests/testthat/test-tokens_replace.R

# Shared fixture: two short documents that differ only by the extra
# upper-case "BB" token in doc1.
txt <- c(
    doc1 = "aa bb BB cc DD ee",
    doc2 = "aa bb cc DD ee"
)
toks_test <- tokens(txt)

test_that("tokens_replace works with regular pattern and replacement", {

    # Apply tokens_replace() to the shared fixture and return a plain list.
    replaced <- function(pattern, replacement, valuetype, case_insensitive) {
        as.list(tokens_replace(toks_test, pattern, replacement, valuetype,
                               case_insensitive = case_insensitive))
    }

    # Upper-casing every type must agree with the tokens conversion method.
    all_types <- types(toks_test)
    expect_equal(replaced(all_types, char_toupper(all_types), "fixed", FALSE),
                 as.list(tokens_toupper(toks_test)))

    # Expected results shared by all three value types.
    exp_insensitive <- list(doc1 = c("a", "b", "b", "cc", "DD", "ee"),
                            doc2 = c("a", "b", "cc", "DD", "ee"))
    exp_sensitive <- list(doc1 = c("a", "b", "BB", "cc", "DD", "ee"),
                          doc2 = c("a", "b", "cc", "DD", "ee"))
    exp_duplicated <- list(doc1 = c("a", "bb", "BB", "cc", "DD", "ee"),
                           doc2 = c("a", "bb", "cc", "DD", "ee"))
    exp_unchanged <- as.list(toks_test)

    # For each value type: a two-pattern vector, a vector repeating the same
    # pattern, and a pattern that matches nothing.
    cases <- list(
        fixed = list(pair = c("aa", "bb"), dup = c("aa", "aa"), miss = "x"),
        glob  = list(pair = c("a*", "b*"), dup = c("a*", "a*"), miss = "x*"),
        regex = list(pair = c("a.", "b."), dup = c("a.", "a."), miss = "x.*")
    )

    for (vt in names(cases)) {
        pats <- cases[[vt]]

        # case-insensitive
        expect_equal(replaced(pats[["pair"]], c("a", "b"), vt, TRUE),
                     exp_insensitive)

        # case-sensitive
        expect_equal(replaced(pats[["pair"]], c("a", "b"), vt, FALSE),
                     exp_sensitive)

        # duplicated types in from
        expect_equal(replaced(pats[["dup"]], c("a", "aaa"), vt, FALSE),
                     exp_duplicated)

        # no match
        expect_equal(replaced(pats[["miss"]], "y", vt, FALSE),
                     exp_unchanged)
    }
})

test_that("tokens_replace works with phrasal pattern and replacement", {

    # Wrap pattern and replacement in phrase(), apply tokens_replace() to the
    # shared fixture and return a plain list.
    replaced <- function(pattern, replacement, valuetype, case_insensitive) {
        as.list(tokens_replace(toks_test, phrase(pattern), phrase(replacement),
                               valuetype, case_insensitive = case_insensitive))
    }

    # Expected when the leading "aa bb" sequence is rewritten to "a b".
    exp_hit <- list(doc1 = c("a", "b", "BB", "cc", "DD", "ee"),
                    doc2 = c("a", "b", "cc", "DD", "ee"))
    # Expected when the pattern matches nothing and tokens are left as they were.
    exp_miss <- list(doc1 = c("aa", "bb", "BB", "cc", "DD", "ee"),
                     doc2 = c("aa", "bb", "cc", "DD", "ee"))

    # fixed, case-insensitive
    expect_equal(replaced("aa bb", "a b", "fixed", TRUE), exp_hit)

    # fixed, case-sensitive: the upper-case pattern must not match
    expect_equal(replaced("AA BB", "a b", "fixed", FALSE), exp_miss)

    # fixed, duplicated patterns: the first replacement is the one expected
    expect_equal(replaced(c("aa bb", "aa bb"), c("a b", "aaa bbb"), "fixed", FALSE),
                 exp_hit)

    # glob, case-insensitive
    expect_equal(replaced("aa *", "a b", "glob", TRUE), exp_hit)

    # glob, case-sensitive: the upper-case pattern must not match
    expect_equal(replaced("AA *", "a b", "glob", FALSE), exp_miss)

    # glob, overlapping patterns: the first replacement is the one expected
    expect_equal(replaced(c("aa *", "aa bb"), c("a b", "aaa bbb"), "glob", FALSE),
                 exp_hit)

    # regex, case-insensitive
    expect_equal(replaced("aa .*", "a b", "regex", TRUE), exp_hit)

    # regex, case-sensitive: the upper-case pattern must not match
    expect_equal(replaced("AA .*", "a b", "regex", FALSE), exp_miss)

    # regex, overlapping patterns: the first replacement is the one expected
    expect_equal(replaced(c("aa .*", "aa bb"), c("a b", "aaa bbb"), "regex", FALSE),
                 exp_hit)
})

test_that("tokens_replace can reconstruct original by re-replacement", {
    toks <- tokens(data_corpus_inaugural)
    original <- as.list(toks)

    # Single token: swap "it" for a placeholder, then swap it back.
    placeholder <- "ZZZZZZZ"
    toks_temp1 <- tokens_replace(toks, "it", placeholder, case_insensitive = FALSE)
    restored1 <- tokens_replace(toks_temp1, placeholder, "it")
    expect_equal(as.list(restored1), original)

    # Phrase: swap "it is" for a longer placeholder sequence, then swap it back.
    phrase_from <- phrase("it is")
    phrase_tmp <- phrase("XXXXXXX YYYYYYY ZZZZZZZ")
    toks_temp2 <- tokens_replace(toks, phrase_from, phrase_tmp, case_insensitive = FALSE)
    restored2 <- tokens_replace(toks_temp2, phrase_tmp, phrase_from)
    expect_equal(as.list(restored2), original)
})


test_that("tokens_replace raises error when input values are invalid", {
    # pattern and replacement of different lengths are rejected
    expect_error(
        tokens_replace(toks_test, c("aa", "bb"), "a"),
        "The length of pattern and replacement must be the same"
    )

    # numeric pattern and replacement are rejected
    expect_error(
        tokens_replace(toks_test, c(1, 2), c(10, 20), valuetype = "fixed"),
        "The type of x must be character"
    )

    # zero-length pattern and replacement leave the tokens untouched
    nothing <- character()
    expect_equal(
        tokens_replace(toks_test, nothing, nothing),
        toks_test
    )
})

test_that("tokens_replace remove tokens when replacement is empty", {

    # First pattern maps to an empty replacement, second to "zz zz".
    res1 <- tokens_replace(
        toks_test,
        list(c("aa", "bb"), c("dd", "ee")),
        list(character(), c("zz", "zz"))
    )
    expect_equal(
        as.list(res1),
        list(doc1 = c("BB", "cc", "zz", "zz"),
             doc2 = c("cc", "zz", "zz"))
    )

    # First pattern maps to "zz zz", second (overlapping on "bb") to an
    # empty replacement.
    res2 <- tokens_replace(
        toks_test,
        list(c("aa", "bb"), c("bb", "cc")),
        list(c("zz", "zz"), character())
    )
    expect_equal(
        as.list(res2),
        list(doc1 = c("zz", "zz", "DD", "ee"),
             doc2 = c("zz", "zz", "DD", "ee"))
    )

})

test_that("tokens_replace works with same replacement characters (#1765)", {

    # Apply tokens_replace() to the shared fixture with the given valuetype
    # and return a plain list.
    replaced <- function(pattern, replacement, valuetype) {
        as.list(tokens_replace(toks_test, pattern, replacement,
                               valuetype = valuetype))
    }

    # Multi-token patterns used for the sequence cases below.
    seq_literal <- list(c("aa", "bb"), c("cc", "DD"))
    seq_regex <- list(c("a.*", "bb"), c("cc", "DD"))

    # Single tokens, both replaced by a token already present in the fixture.
    exp1 <- list(doc1 = c("bb", "bb", "BB", "bb", "DD", "ee"),
                 doc2 = c("bb", "bb", "bb", "DD", "ee"))
    expect_equal(replaced(c("aa", "cc"), c("bb", "bb"), "fixed"), exp1)
    expect_equal(replaced(c("aa", "cc"), c("bb", "bb"), "glob"), exp1)
    expect_equal(replaced(c("a.*", "cc"), c("bb", "bb"), "regex"), exp1)

    # Sequences, both replaced by a token already present in the fixture.
    exp2 <- list(doc1 = c("bb", "BB", "bb", "ee"),
                 doc2 = c("bb", "bb", "ee"))
    expect_equal(replaced(seq_literal, c("bb", "bb"), "fixed"), exp2)
    expect_equal(replaced(seq_literal, c("bb", "bb"), "glob"), exp2)
    expect_equal(replaced(seq_regex, c("bb", "bb"), "regex"), exp2)

    # Single tokens, both replaced by a token absent from the fixture.
    exp3 <- list(doc1 = c("zz", "bb", "BB", "zz", "DD", "ee"),
                 doc2 = c("zz", "bb", "zz", "DD", "ee"))
    expect_equal(replaced(c("aa", "cc"), c("zz", "zz"), "fixed"), exp3)
    expect_equal(replaced(c("aa", "cc"), c("zz", "zz"), "glob"), exp3)
    expect_equal(replaced(c("a.*", "cc"), c("zz", "zz"), "regex"), exp3)

    # Sequences, both replaced by a token absent from the fixture.
    # The first case previously repeated valuetype = "glob", so "fixed" was
    # never exercised for this combination.
    exp4 <- list(doc1 = c("zz", "BB", "zz", "ee"),
                 doc2 = c("zz", "zz", "ee"))
    expect_equal(replaced(seq_literal, c("zz", "zz"), "fixed"), exp4)
    expect_equal(replaced(seq_literal, c("zz", "zz"), "glob"), exp4)
    expect_equal(replaced(seq_regex, c("zz", "zz"), "regex"), exp4)
})

test_that("apply_if argument is working", {

    # Two documents that both contain the token "C", tagged by topic.
    toks <- tokens(corpus(data.frame(
        text = c("C language", "Vitamin C"),
        topic = c("language", "vitamin")
    )))

    # Replacement restricted to the "language" document.
    expect_identical(
        as.list(tokens_replace(toks, "C", "R", apply_if = toks$topic == "language")),
        list(text1 = c("R", "language"), text2 = c("Vitamin", "C"))
    )

    # Replacement restricted to the "vitamin" document.
    expect_identical(
        as.list(tokens_replace(toks, "C", "D", apply_if = toks$topic == "vitamin")),
        list(text1 = c("C", "language"), text2 = c("Vitamin", "D"))
    )

})
# quanteda/quanteda documentation built on April 15, 2024, 7:59 a.m.