# tests/testthat/test_search_contexts.r

# Label under which the expectations in this file are reported.
# NOTE(review): context() is deprecated in testthat 3rd edition (the test file
# name is used instead) -- confirm which edition the package declares before
# removing this call.
testthat::context('Search Contexts')

# Checks query-based context search (search_contexts), one feature search
# (search_features) and query subsetting (subset_query) against a small corpus
# of four documents ('a'-'d') that is created with sentence splitting enabled.
test_that("Query document search works", {
  text = c('Renewable fuel is better than fossil fuels!',
           'A fueled debate about fuel',
           'Mark Rutte is simply Rutte. Bos, on the other hand, is not always Wouter',
           'Hey, A ~ symbol!! Can I match that?')
  tc = create_tcorpus(text, doc_id = c('a','b','c','d'), split_sentences = TRUE)

  ## document-level boolean query; not=TRUE returns the complement
  hits = search_contexts(tc, 'mark AND rutte')
  expect_equal(as.character(hits$hits$doc_id), 'c')

  hits_not = search_contexts(tc, 'mark AND rutte', not=TRUE)
  expect_equal(as.character(hits_not$hits$doc_id), c('a','b','d'))

  ## sentence-level phrase query; the complement lists the remaining sentences
  hits = search_contexts(tc, '"mark rutte"', context_level = 'sentence')
  expect_equal(hits$hits$sentence, 1)

  hits_not = search_contexts(tc, '"mark rutte"', context_level = 'sentence', not=TRUE)
  expect_equal(hits_not$hits$sentence, c(1,1,2,1,2))

  ## test context boundaries
  hits = search_contexts(tc, '"rutte bos"~5', context_level = 'document') ## should find rutte and bos across sentences
  expect_gt(nrow(hits$hits), 0)

  hits = search_contexts(tc, '"rutte bos"~5', context_level = 'sentence') ## should not find rutte and bos across sentences
  expect_equal(nrow(hits$hits), 0)

  ## proximity search
  hits = search_contexts(tc, '"bos wouter"~5', context_level = 'sentence') # bos and wouter within a token distance of 5
  expect_equal(nrow(hits$hits), 0)

  hits = search_contexts(tc, '"bos wouter"~10', context_level = 'sentence') #  bos and wouter within a token distance of 10
  expect_gt(nrow(hits$hits), 0)

  hits = search_contexts(tc, 'wouter NOT "bos wouter"~10', context_level = 'sentence') # wouter should not occur within 10 tokens from bos
  expect_equal(nrow(hits$hits), 0)

  hits = search_contexts(tc, 'wouter NOT "bos wouter"~3', context_level = 'sentence') # wouter should not occur within 3 tokens from bos
  expect_equal(nrow(hits$hits), 1)

  ## BOOLEAN
  hits = search_contexts(tc, 'wouter AND bos')
  expect_equal(nrow(hits$hits), 1)
  hits = search_contexts(tc, 'wouter NOT bos')
  expect_equal(nrow(hits$hits), 0)
  hits = search_contexts(tc, 'wouter NOT (bos OR banaan)') # neither bos nor banaan may occur
  expect_equal(nrow(hits$hits), 0)
  hits = search_contexts(tc, 'wouter NOT (bos AND banaan)') # bos and banaan may not occur together
  expect_equal(nrow(hits$hits), 1)

  ## case sensitive (the ~s flag)
  hits = search_contexts(tc, 'bos~s')
  expect_equal(nrow(hits$hits), 0)
  hits = search_contexts(tc, 'Bos~s')
  expect_equal(nrow(hits$hits), 1)

  # NOTE(review): the corpus is rebuilt here and again before the subsetting
  # checks; presumably to start from a fresh object -- confirm before removing.
  tc = create_tcorpus(text, doc_id = c('a','b','c','d'), split_sentences = TRUE)
  hits = search_contexts(tc, '"wouter bos"~10s') ## if flag on quotes, all within quotes needs to be case sensitive
  expect_equal(nrow(hits$hits), 0)
  hits = search_contexts(tc, '"Wouter Bos"~s10')
  expect_equal(nrow(hits$hits), 1)
  expect_equal(search_contexts(tc, '"Wouter Bos"~10s')$hits, search_contexts(tc, '"Wouter Bos"~s10')$hits) ## order of flags is irrelevant

  ## using special characters (other than ?, * or ~)
  # This check goes through search_features rather than search_contexts; the
  # expected ids list 'd' twice, presumably once per '!' token in that document.
  hits = search_features(tc, '!')
  expect_equal(as.character(hits$hits$doc_id), c('a','d','d'))

  ## query subsetting
  tc = create_tcorpus(text, doc_id = c('a','b','c','d'), split_sentences = TRUE)
  tc_rutte = subset_query(tc, '"mark rutte"~2', context_level = 'sentence')
  expect_equal(tc_rutte$get_meta('doc_id'), 'c')

  # With not=TRUE at sentence level every document is still expected in the
  # meta, including 'c' (presumably because its non-matching sentence remains).
  tc_not_rutte = subset_query(tc, '"mark rutte"~2', context_level = 'sentence', not=TRUE)
  expect_equal(tc_not_rutte$get_meta('doc_id'), c('a','b','c','d'))
})

# Try the corpustools package in your browser
#
# Any scripts or data that you put into this service are public.
#
# corpustools documentation built on May 31, 2023, 8:45 p.m.