tests/testthat/test-kmers.R

test_that("multiplication works", {
  testthat::expect_true(is.list(kmers("ACTGC")))
})

genomes_dir <- file.path("fixtures", "genomes")

test_that("non canonical kmer counting is correct", {
  genomes_filepaths <- list.files(genomes_dir, full.names = TRUE)
  genomes <- lapply(genomes_filepaths, Biostrings::readDNAStringSet)
  tmp_dir <- tempdir()
  unlink(tmp_dir, recursive = TRUE)
  genomes_to_kmer_libsvm(genomes_dir, tmp_dir, k = 2, canonical = FALSE)
  kmer_files <- list.files(tmp_dir, full.names = TRUE)
  # read last file and print
  kmer_data <- readLines(kmer_files[4])
  kmer_data <- unlist(strsplit(kmer_data, " "))
  assert_str <- c("0", "1:3", "2:2", "3:1", "4:2",
                  "5:1", "7:4", "8:6",
                  "9:1", "10:4",
                  "13:2", "14:6", "16:1")
  expect_true(all.equal(kmer_data, assert_str))
})

test_that("canonical kmer counting is correct", {
  genome_str <- "ACACTT"
  kmers <- kmers(genome_str, k = 2, canonical = TRUE, anchor = FALSE)
  expect_true(all.equal(kmers, list(kmer_string = c("AA", "AC", "AG", "CA"),
                                    kmer_value = c(1, 2, 1, 1))))
  })

test_that("test kmer squeeze", {
  for (k in 3:8) {
    all_mers <- kmers("AAAAAAAAAAAAAAA", k = k, anchor = TRUE)
    all_mers <- all_mers$kmer_string
    all_mers <- paste0(all_mers, collapse = "")
    all_counts <- kmers(all_mers, k = k, anchor = TRUE)
    filtered_mers <- all_counts$kmer_string[all_counts$kmer_value > 0]
    squeezed_mers <- kmers("AAAAAAAAAAAAAAA", k = k, anchor = TRUE, squeeze = TRUE)
    squeezed_mers <- squeezed_mers$kmer_string
    expect_true(all.equal(filtered_mers, squeezed_mers))
  }
})

test_that("test kmer to libsvm squeeze", {
  use_genome <- 6
  for (k in 3:8) {
    # calculate without squeeze first - use genomes_to_kmer_libsvm()
    genomes_filepaths <- list.files(genomes_dir, full.names = TRUE)
    tmp_dir <- tempdir()
    unlink(tmp_dir, recursive = TRUE)
    genomes_to_kmer_libsvm(genomes_dir, tmp_dir, k = k, canonical = TRUE, squeeze = FALSE)
    kmer_files <- list.files(tmp_dir, full.names = TRUE)
    kmer_data <- kmer_files[use_genome] |>
      readLines() |>
      strsplit(" ") |>
      unlist()

    # calculate without squeeze - using kmers() directly
    genome_str <- genomes_filepaths[use_genome] |>
      Biostrings::readDNAStringSet() |>
      as.character()
    kmer_data2 <- kmers(genome_str, k = k, canonical = TRUE, squeeze = FALSE)
    # get the index of the last recorded kmer:
    last_kmer <- max(which(kmer_data2$kmer_value > 0))
    # check this is the same as using the libsvm file
    last_kmer_libsvm <- max(as.numeric(strsplit(kmer_data[length(kmer_data)], ":")[[1]][1]))
    expect_true(last_kmer == last_kmer_libsvm)

    # now calculate with squeeze
    unlink(tmp_dir, recursive = TRUE)
    genomes_to_kmer_libsvm(genomes_dir, tmp_dir, k = k, canonical = TRUE, squeeze = TRUE)
    kmer_files <- list.files(tmp_dir, full.names = TRUE)
    kmer_data <- kmer_files[use_genome] |>
      readLines() |>
      strsplit(" ") |>
      unlist()

    # calculate with squeeze - using kmers() directly
    kmer_data2 <- kmers(genome_str, k = k, canonical = TRUE, squeeze = TRUE)
    # get the index of the last recorded kmer:
    last_kmer <- max(which(kmer_data2$kmer_value > 0))
    # check this is the same as using the libsvm file
    last_kmer_libsvm <- max(as.numeric(strsplit(kmer_data[length(kmer_data)], ":")[[1]][1]))
    expect_true(last_kmer == last_kmer_libsvm)
  }
})

test_that("check that can revert the squeeze", {
    use_genome <- 6
    for (k in 3:8) {
      squeezed_mers <- squeezed_mers(k = k)
      calculated_mers <- kmers("AAAAAAAAAAAAAAA", k = k, anchor = TRUE, squeeze = TRUE)
      expect_true(all.equal(squeezed_mers, calculated_mers$kmer_string))

      genomes_filepaths <- list.files(genomes_dir, full.names = TRUE)
      tmp_dir <- tempdir()
      unlink(tmp_dir, recursive = TRUE)
      genomes_to_kmer_libsvm(genomes_dir, tmp_dir, k = k, canonical = TRUE, squeeze = FALSE)
      kmer_files <- list.files(tmp_dir, full.names = TRUE)
      kmer_data_unsqueezed <- kmer_files[use_genome] |>
        readLines() |>
        strsplit(" ") |>
        unlist()

      # calculate with squeeze
      unlink(tmp_dir, recursive = TRUE)
      genomes_to_kmer_libsvm(genomes_dir, tmp_dir, k = k, canonical = TRUE, squeeze = TRUE)
      kmer_files <- list.files(tmp_dir, full.names = TRUE)
      kmer_data_squeezed <- kmer_files[use_genome] |>
        readLines() |>
        strsplit(" ") |>
        unlist()

      expect_false(isTRUE(all.equal(kmer_data_unsqueezed, kmer_data_squeezed)))

      # now unsqueeze
      squeezed_indices <- kmer_data_squeezed[2:length(kmer_data_squeezed)]
      squeezed_indices <- sapply(squeezed_indices, \(x) strsplit(x, ":")[[1]][[1]], USE.NAMES = F)
      squeezed_indices <- as.numeric(squeezed_indices)
      squeezed_str <- squeezed_index_to_str(squeezed_indices, k = k)

      # compare to unsqueezed data
      unsqueezed_indices <- kmer_data_unsqueezed[2:length(kmer_data_unsqueezed)]
      unsqueezed_indices <- sapply(unsqueezed_indices, \(x) strsplit(x, ":")[[1]][[1]], USE.NAMES = F)
      unsqueezed_indices <- as.numeric(unsqueezed_indices)
      unsqueezed_str <- unsqueezed_index_to_str(unsqueezed_indices, k = k)

      expect_true(all.equal(squeezed_str, unsqueezed_str))
    }
})

Try the MIC package in your browser

Any scripts or data that you put into this service are public.

MIC documentation built on April 12, 2025, 2:26 a.m.