# tests/testthat/test_count_ngrams.R

# Label the tests in this file for testthat's reporter output.
# NOTE(review): context() is deprecated in testthat 3rd edition -- confirm
# which edition this package runs under before relying on or removing it.
context("Counting n-grams")

test_that("Count ngrams for different distances", {
  # Integer sequence of 30 elements (values 1 and 2) shared by all checks.
  seq_under_test <- as.integer(c(
    1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1,
    2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2
  ))

  # Sum of all trigram counts for each value passed as `d`; every total is
  # computed before any expectation runs.
  distances <- list(0, 1, c(2, 1), c(2, 2))
  totals <- lapply(distances, function(dist) {
    sum(count_ngrams(seq_under_test, 3, 1:4, d = dist))
  })

  expect_equal(totals[[1]], 28)
  expect_equal(totals[[2]], 26)
  expect_equal(totals[[3]], 25)
  expect_equal(totals[[4]], 24)

  # Check the complete structure of the result for d = c(2, 2).
  # Expected column names cover all 4^3 combinations of the elements 1:4,
  # with the first element varying fastest, each suffixed with "_2.2".
  elements <- expand.grid(first = 1:4, second = 1:4, third = 1:4)
  ngram_names <- paste0(
    elements$first, ".", elements$second, ".", elements$third, "_2.2"
  )

  # Only these columns hold non-zero counts; every other column is 0.
  nonzero_counts <- c(
    "1.1.1_2.2" = 2, "2.1.1_2.2" = 2,
    "1.2.1_2.2" = 3, "2.2.1_2.2" = 3,
    "1.1.2_2.2" = 2, "2.1.2_2.2" = 3,
    "1.2.2_2.2" = 4, "2.2.2_2.2" = 5
  )

  expected <- matrix(
    0,
    nrow = 1L,
    ncol = length(ngram_names),
    dimnames = list(NULL, ngram_names)
  )
  expected[1L, names(nonzero_counts)] <- unname(nonzero_counts)

  expect_equal(
    as.matrix(count_ngrams(seq_under_test, 3, 1:4, d = c(2, 2))),
    expected
  )
})


test_that("Count ngrams for different positions", {
  # Integer sequence: three 1s, four 2s, then a single 1.
  seq_under_test <- c(rep(1L, 3), rep(2L, 4), 1L)

  # Expected column names: a position prefix (1 to 6, varying fastest)
  # followed by every combination of three elements from 1:2 and the
  # suffix "_0.0".
  layout <- expand.grid(pos = 1:6, first = 1:2, second = 1:2, third = 1:2)
  ngram_names <- paste0(
    layout$pos, "_",
    layout$first, ".", layout$second, ".", layout$third,
    "_0.0"
  )

  # Columns expected to hold a count of 1; all remaining columns are 0.
  present <- c(
    "1_1.1.1_0.0", "6_2.2.1_0.0", "2_1.1.2_0.0",
    "3_1.2.2_0.0", "4_2.2.2_0.0", "5_2.2.2_0.0"
  )

  expected <- matrix(
    0,
    nrow = 1L,
    ncol = length(ngram_names),
    dimnames = list(NULL, ngram_names)
  )
  expected[1L, present] <- 1

  expect_equal(
    as.matrix(count_ngrams(seq_under_test, 3, 1:2, pos = TRUE)),
    expected
  )
})
# michbur/biogram documentation built on Feb. 4, 2024, 6:38 p.m.