tests/testthat/test-corpus_segment.R

test_that("char_segment works with punctuations", {
    txt <- c(d1 = "Sentence one.  Second sentence is this one!\n
             Here is the third sentence.",
             d2 = "Only sentence of doc2?  No there is another.")
    char_seg <- char_segment(txt, "\\p{P}", valuetype = "regex", 
                             remove_pattern = FALSE, pattern_position = "after")
    expect_equal(char_seg[4], c(d2.1 = "Only sentence of doc2?"))
})

test_that("char_segment works for tags", {
    txt <- c("##INTRO This is the introduction.
                           ##DOC1 This is the first document.
                           Second sentence in Doc 1.
                           ##DOC3 Third document starts here.
                           End of third document.",
                           "##INTRO Document ##NUMBER Two starts before ##NUMBER Three.")
    char_seg <- char_segment(txt, "##[A-Z0-9]+", valuetype = "regex",
                                pattern_position = "before", remove_pattern = TRUE)
    expect_equal(char_seg[5], "Two starts before")
})

test_that("char_segment works for glob customized tags", {
    txt <- c("INTRO: This is the introduction. 
                           DOC1: This is the first document.  
                           Second sentence in Doc 1.  
                           DOC3: Third document starts here.  
                           End of third document.",
             "INTRO: Document NUMBER: Two starts before NUMBER: Three.")
    char_seg <- char_segment(txt, "*:", valuetype = "glob", 
                                pattern_position = "before", remove_pattern = TRUE)
    expect_equal(char_seg[6], "Three.")
})

test_that("char_segment works for glob customized tags, test 2", {
    txt <- c("[INTRO] This is the introduction. 
                           [DOC1] This is the first document.  
                           Second sentence in Doc 1.  
                           [DOC3] Third document starts here.  
                           End of third document.",
             "[INTRO] Document [NUMBER] Two starts before [NUMBER] Three.")
    char_seg <- char_segment(txt, "[*]", valuetype = "glob", 
                                pattern_position = "before", remove_pattern = TRUE)
    expect_equal(char_seg[6], "Three.")
})

test_that("corpus_segment works with blank before tag", {
    corp <- corpus(c("\n##INTRO This is the introduction.
                        ##DOC1 This is the first document.  Second sentence in Doc 1.
                        ##DOC3 Third document starts here.  End of third document.",
                        "##INTRO Document ##NUMBER Two starts before ##NUMBER Three."))
    corp_seg <- corpus_segment(corp, "##[A-Z0-9]+", valuetype = "regex", 
                               pattern_position = "before", extract_pattern = TRUE)
    summ <- summary(corp_seg)
    expect_equal(summ[1, "Tokens"], 5)
    expect_equal(as.character(summ[1, "Text"]), "text1.1")
})

test_that("corpus_segment works with use_docvars TRUE or FALSE", {
    corp <- corpus(c(d1 = "##TEST One two ##TEST2 Three",
                     d2 = "##TEST3 Four", 
                     d3 = ""),
                     docvars = data.frame(test = c("A", "B", "B"), stringsAsFactors = FALSE))
    corp_seg1 <- corpus_segment(corp, "##[A-Z0-9]+", valuetype = "regex", 
                                pattern_position = "before", extract_pattern = TRUE, 
                                use_docvars = TRUE)
    expect_equal(docvars(corp_seg1, "test"), c("A", "A", "B"))
    expect_equal(docid(corp_seg1), 
                 factor(c("d1", "d1", "d2"), levels = c("d1", "d2", "d3")))
    expect_equal(segid(corp_seg1), 
                 c(1L, 2L, 1L))
    
    corp_seg2 <- corpus_segment(corp, "##[A-Z0-9]+", valuetype = "regex", 
                                pattern_position = "before", extract_pattern = TRUE, use_docvars = FALSE)
    expect_error(docvars(corp_seg2, "test"))
    expect_equal(docid(corp_seg2), 
                 factor(c("d1", "d1", "d2"), levels = c("d1", "d2", "d3")))
    expect_equal(segid(corp_seg2), 
                 c(1L, 2L, 1L))
})

test_that("char_segment works with Japanese texts", {
    
    skip_on_os("windows")
    txt <- "日本語の終止符は.ではない。しかし、最近は.が使われることある。"
    expect_equal(char_segment(txt, '\\p{P}', valuetype = "regex", 
                              remove_pattern = FALSE, pattern_position = "after"),
                 c("日本語の終止符は.", "ではない。", "しかし、", "最近は.", "が使われることある。"))
    
    expect_equal(char_segment(txt, '。', valuetype = "fixed", remove_pattern = FALSE,
                              pattern_position = "after"),
                 c("日本語の終止符は.ではない。", "しかし、最近は.が使われることある。"))
})

test_that("corpus_segment works with position argument", {
    
    corp1 <- corpus(c(d1 = "##TEST One two ##TEST2 Three",
                      d2 = "##TEST3 Four"))
    corp1_seg <- corpus_segment(corp1, '##', valuetype = 'fixed', 
                                pattern_position = "before", extract_pattern = FALSE)
    expect_equal(as.character(corp1_seg), c("d1.1" = "##TEST One two",
                                     "d1.2" = "##TEST2 Three",
                                     "d2.1" = "##TEST3 Four"))
    
    corp2 <- corpus(c(d1 = "TEST One two; TEST2 Three;",
                      d2 = "TEST3 Four;"))
    corp2_seg <- corpus_segment(corp2, ';', valuetype = 'fixed', 
                                pattern_position = 'after', extract_pattern = FALSE)
    expect_equal(as.character(corp2_seg), c("d1.1" = "TEST One two;",
                                     "d1.2" = "TEST2 Three;",
                                     "d2.1" = "TEST3 Four;"))
    
    corp3 <- corpus(c(d1 = "**TEST One two ##TEST2 Three",
                      d2 = "??TEST3 Four"))
    corp3_seg <- corpus_segment(corp3, '[*#?]{2}', valuetype = 'regex', 
                                pattern_position = 'before', extract_pattern = FALSE)
    expect_equal(as.character(corp3_seg), c("d1.1" = "**TEST One two",
                                     "d1.2" = "##TEST2 Three",
                                     "d2.1" = "??TEST3 Four"))
    
    corp4 <- corpus(c(d1 = "TEST One two; TEST2 Three?",
                      d2 = "TEST3 Four!"))
    corp4_seg <- corpus_segment(corp4, '[!?;]', valuetype = 'regex', pattern_position = 'after',
                                extract_pattern = FALSE)
    expect_equal(as.character(corp4_seg), c("d1.1" = "TEST One two;",
                                     "d1.2" = "TEST2 Three?",
                                     "d2.1" = "TEST3 Four!"))
    
})


test_that("corpus_segment works for delimiter with remove_pattern", {

    txt <- c(d1 = "Sentence one.  Second sentence is this one!\n
                   Here is the third sentence.",
             d2 = "Only sentence of doc2?  No there is another.")

    corp <- corpus(txt, docvars = data.frame(title = c("doc1", "doc2")))
    corp_seg1 <- corpus_segment(corp, '[.!?]', valuetype = 'regex',
                                  pattern_position = "after",
                                  extract_pattern = FALSE)

    expect_equal(as.character(corp_seg1),
                 c(d1.1 = "Sentence one.",
                   d1.2 = "Second sentence is this one!",
                   d1.3 = "Here is the third sentence.",
                   d2.1 = "Only sentence of doc2?",
                   d2.2 = "No there is another."))

    corp_seg2 <- corpus_segment(corp, pattern = '[.!?]', valuetype = 'regex',
                                  pattern_position = "after",
                                  extract_pattern = TRUE)
    expect_equal(as.character(corp_seg2),
                 c(d1.1 = "Sentence one",
                   d1.2 = "Second sentence is this one",
                   d1.3 = "Here is the third sentence",
                   d2.1 = "Only sentence of doc2",
                   d2.2 = "No there is another"))
})

test_that("tag extraction works", {
    corp <- corpus(c("#tag1 Some text. #tag2 More text."))
    
    corpseg <- corpus_segment(corp, pattern = "#*", extract_pattern = TRUE)
    expect_equal(
        docvars(corpseg, "pattern"),
        c("#tag1", "#tag2")
    )
    
    corpseg <- corpus_segment(corp, pattern = "#*", extract_pattern = FALSE)
    expect_error(
        docvars(corpseg, "pattern"),
        "field\\(s\\) pattern not found"
    )
})

test_that("corpus_segment works for begining and end tags", {
    corp <- corpus(c(d1 = "##START ##INTRO This is the introduction.
                           ##DOC1 This is the first document.  Second sentence in Doc 1.
                           ##DOC3 Third document starts here.  End of third document.",
                     d2 = "##INTRO Document ##NUMBER Two starts before ##NUMBER Three. ##END"))
    
    corp_seg1 <- corpus_segment(corp, "##*", pattern_position = "before", extract_pattern = TRUE)
    expect_equal(head(docvars(corp_seg1, "pattern"), 1), "##START")
    expect_equal(head(as.character(corp_seg1), 1), c(d1.1 = ""))
    expect_equal(tail(docvars(corp_seg1, "pattern"), 1), "##END")
    expect_equal(tail(as.character(corp_seg1), 1), c(d2.4 = ""))
    
    corp_seg2 <- corpus_segment(corp, "##*", pattern_position = "before", extract_pattern = FALSE)
    expect_error(docvars(corp_seg2, "pattern"))
    expect_equal(head(as.character(corp_seg2), 1), c(d1.1 = "##START"))
    expect_equal(tail(as.character(corp_seg2), 1), c(d2.4 = "##END"))
    
    corp_seg3 <- corpus_segment(corp, "##*", pattern_position = "after", extract_pattern = TRUE)
    expect_equal(head(docvars(corp_seg3, "pattern"), 1), "##START")
    expect_equal(head(as.character(corp_seg3), 1), c(d1.1 = ""))
    expect_equal(tail(docvars(corp_seg3, "pattern"), 1), "##END")
    expect_equal(tail(as.character(corp_seg3), 1), c(d2.4 = "Three."))
    
    corp_seg4 <- corpus_segment(corp, "##*", pattern_position = "after", extract_pattern = FALSE)
    expect_error(docvars(corp_seg4, "pattern"))
    expect_equal(head(as.character(corp_seg4), 1), c(d1.1 = "##START"))
    expect_equal(tail(as.character(corp_seg4), 1), c(d2.4 = "Three. ##END"))
    
})

test_that("corpus_segment works with multiple patterns (#1394)", {
    
    txt <- "Some text, more text\\nINTRODUCTION This is a test\\nCONTACT John Doe SOURCE Library of Congress"
    
    expect_identical(char_segment(txt, c("INTRODUCTION*", "CONTACT", "SOURCE"), valuetype = "glob"),
                     c("This is a test\\n", "John Doe", "Library of Congress"))
    
    expect_identical(char_segment(txt, c("INTRODUCTION", "CONTACT", "SOURCE"), valuetype = "fixed"),
                     c("This is a test\\n", "John Doe", "Library of Congress"))
    
    
    expect_identical(as.character(corpus_segment(corpus(txt), c("INTRODUCTION*", "CONTACT", "SOURCE"), valuetype = "glob")),
                     c(text1.1 = "This is a test\\n", text1.2 = "John Doe", text1.3 = "Library of Congress"))
    
    expect_identical(as.character(corpus_segment(corpus(txt), c("INTRODUCTION", "CONTACT", "SOURCE"), valuetype = "fixed")),
                     c(text1.1 = "This is a test\\n", text1.2 = "John Doe", text1.3 = "Library of Congress"))
    
})

test_that("corpus_segment works can be used multiple times", {
    corp <- corpus(c("##DOC1 First sentences in Doc 1.  Second sentence in Doc 1.
                      ##DOC2 First sentences in Doc 2.  Second sentence in Doc 2."))
    corp_seg <- corpus_segment(corp, "##[A-Z0-9]+", valuetype = "regex", 
                               pattern_position = "before", extract_pattern = TRUE)
    corp_seg2 <- corpus_segment(corp_seg, ".", valuetype = "fixed", 
                                pattern_position = "after", extract_pattern = TRUE)
    
    expect_equal(as.character(corp_seg2), c("text1.1" = "First sentences in Doc 1",
                                     "text1.2" = "Second sentence in Doc 1",
                                     "text1.3" = "First sentences in Doc 2",
                                     "text1.4" = "Second sentence in Doc 2"))
})

Try the quanteda package in your browser

Any scripts or data that you put into this service are public.

quanteda documentation built on May 31, 2023, 8:28 p.m.