tests/testthat/test--taxmap_parsers.R

library(testthat)
context("taxmap parsers")

test_that("Taxmap can be intialized from complex data", {

  # Basic parsing
  my_vector <- c("A;B;C;D", "A;E;F;G", "A;B;H;I")
  my_list_1 <- list("A;B;C;D", "A;E;F;G", c("A;B", "H;I"))
  my_list_2 <- list(c(Phylum = "A", Class = "B", Order = "C", Family = "D"),
                    c(Phylum = "A", Class = "E", Order = "F", Family = "G"),
                    c(Phylum = "A", Class = "B", Order = "H", Family = "I"))
  my_frame <- data.frame(tax = c("A;B;C", "A;E;F", "A;B;H"),
                         species = c("D", "G", "I"))
  my_frames <- list(data.frame(tax = c("A", "B", "C", "D")),
                    data.frame(tax = c("A", "E", "F", "G")),
                    data.frame(tax = c("A", "B", "H", "I")))

  vector_result <- parse_tax_data(my_vector, include_tax_data = FALSE)
  list_1_result <- parse_tax_data(my_list_1, include_tax_data = FALSE)
  list_2_result <- parse_tax_data(my_list_2, include_tax_data = FALSE)
  frame_result <- parse_tax_data(my_frame, class_cols = c("tax", "species"),
                                 include_tax_data = FALSE)

  expect_equal(length(vector_result$taxon_ids()), 9)
  expect_equal(length(vector_result$roots()), 1)
  expect_equal(vector_result, list_1_result)
  expect_equal(vector_result, list_2_result)
  expect_equal(vector_result, frame_result)

  # Incorperating rank information in lists of vectors
  result <- parse_tax_data(my_list_2, include_tax_data = FALSE,
                           named_by_rank = TRUE)
  expect_true(all(c("Phylum", "Class", "Order", "Family") %in% result$taxon_ranks()))

  # Basic parsing with datasets
  test_obj <- parse_tax_data(my_vector, list(test = letters[1:3]),
                             mappings = c("{{index}}" = "{{index}}"))
  expect_equal(test_obj$map_data(test, taxon_names),
               structure(c("D", "G", "I"), .Names = c("a", "b", "c")))

  a_dataset <- data.frame(my_index = c(3, 2),
                          dataset_key = c("key_3", "key_2"))
  rownames(a_dataset) <- c("name_3", "name_2")
  a_tax_data <- data.frame(tax = c("A;B;C", "A;E;F", "A;B;H"),
                           species = c("D", "G", "I"),
                           tax_key = c("key_1", "key_2", "key_3"))
  rownames(a_tax_data) <- c("name_1", "name_2", "name_3")
  test_obj <- parse_tax_data(a_tax_data, class_cols = c("tax", "species"),
                             datasets = list(my_data = a_dataset),
                             mappings = c("{{index}}" = "my_index"))
  expect_equal(test_obj$data$my_data$taxon_id, c("j", "i"))
  test_obj <- parse_tax_data(a_tax_data, class_cols = c("tax", "species"),
                             datasets = list(my_data = a_dataset),
                             mappings = c("{{name}}" = "{{name}}"))
  expect_equal(test_obj$data$my_data$taxon_id, c("j", "i"))
  test_obj <- parse_tax_data(a_tax_data, class_cols = c("tax", "species"),
                             datasets = list(my_data = a_dataset),
                             mappings = c("tax_key" = "dataset_key"))
  expect_equal(test_obj$data$my_data$taxon_id, c("j", "i"))

  # Parsing lists of data frames with data
  my_frames <- list(data.frame(tax = c("A", "B", "C", "D"),
                               my_rank = c("P", "C", "O", "F")),
                    data.frame(tax = c("A", "E", "F", "G"),
                               my_rank = c("P", "C", "O", "F")),
                    data.frame(tax = c("A", "B", "H", "I"),
                               my_rank = c("P", "C", "O", "F")))
  test_obj <- parse_tax_data(my_frames, class_cols = "tax")
  expect_equal(length(test_obj$taxon_ids()), nrow(test_obj$data$tax_data))
  expect_true("my_rank" %in% colnames(test_obj$data$tax_data))

  # Parsing complex classifications
  raw_data <- c("K__Mammalia;P__Carnivora;C__Felidae;G__Panthera;S__leo",
                "K__Mammalia;P__Carnivora;C__Felidae;G__Panthera;S__tigris",
                "K__Mammalia;P__Carnivora;C__Felidae;G__Ursus;S__americanus")
  result <- parse_tax_data(raw_data, class_sep = ";", class_regex = "^(.+)__(.+)$",
                 class_key = c(my_rank = "taxon_rank", tax_name = "taxon_name"),
                 include_match = FALSE)
  expect_true(all(c("K", "P", "C", "G", "S") %in% result$taxon_ranks()))
  expect_error(parse_tax_data(raw_data, class_sep = ";", class_regex = "^(.+)__(.+)$",
                              class_key = c(my_rank = "taxon_rank", tax_name = "taxon_name"),
                              include_match = FALSE, named_by_rank = TRUE))

  # Check that class_key names can be ommited
  result <- parse_tax_data(raw_data, class_sep = ";", class_regex = "^(.+)__(.+)$",
                           class_key = c("taxon_rank", "taxon_name"))
  expect_true(all(c("taxon_rank_match", "taxon_name_match") %in% colnames(result$data$class_data)))
  result <- parse_tax_data(raw_data, class_sep = ";", class_regex = "^(.+)__(.+)(.+)(.+)$",
                           class_key = c("taxon_rank", "taxon_name", "info", "info"))
  expect_true("info_match_1" %in% colnames(result$data$class_data))
  result <- parse_tax_data(raw_data, class_sep = ";", class_regex = "^(.+)__(.+)(.+)(.+)$",
                           class_key = c("taxon_rank", "taxon_name", "info", x = "info"))
  expect_true(all(c("info_match", "x") %in% colnames(result$data$class_data)))

  # # Check for data names that are the same as function names
  # expect_warning(parse_tax_data(raw_data, class_sep = ";",
  #                               class_regex = "^(.+)__(.+)$",
  #                               class_key = c(rank = "info",
  #                                             tax_name = "taxon_name"),
  #                               include_match = FALSE),
  #                "same name as functions")

  # Invalid regex error
  expect_error(parse_tax_data(raw_data, class_sep = ";",
                                class_regex = "^(.+)_invalid_(.+)$",
                                class_key = c(rank = "info",
                                              tax_name = "taxon_name"),
                                include_match = FALSE),
                 "could not be matched by the regex supplied")

  # Expect an error if bad column name
  expect_error(parse_tax_data(raw_data, class_cols = "Not a column"),
               'No item')
  expect_error(parse_tax_data(1:3, class_cols = "Not a column"),
               'No item named')
  expect_error(parse_tax_data(1:3, class_cols = 10),
               'out of bounds for inputs:')

  # Invalid mapping options
  expect_error(parse_tax_data(1:3, datasets = list(), mappings = 1),
               'must have the same number of values')
  expect_error(parse_tax_data(1:3, datasets = 1, mappings = 1),
               'The mapping options must be named.')
  expect_error(parse_tax_data(1:3, datasets = 1, mappings = c(sdaff = 2)),
               'Invalid inputs to the `mappings` found')


  # Zero-length inputs produce empty taxmap objects
  expect_equal(taxon_names(parse_tax_data(c())),          character(0))
  expect_equal(taxon_names(parse_tax_data(character(0))), character(0))
  expect_equal(taxon_names(parse_tax_data(list())),       character(0))
  expect_equal(taxon_names(parse_tax_data(data.frame())), character(0))

})


test_that("Taxmap can be intialized from queried data", {

  skip_on_cran()

  # Make test data
  raw_data <- data.frame(tax = c("Mammalia;Carnivora;Felidae",
                                 "Mammalia;Carnivora;Felidae",
                                 "Mammalia;Carnivora;Ursidae"),
                         species = c("Panthera leo",
                                     "Panthera tigris",
                                     "Ursus americanus"),
                         my_tax_id = c("9689", "9694", "9643"),
                         my_seq = c("AB548412", "FJ358423", "DQ334818"),
                         species_id = c("A", "B", "C"))
  abundance <- data.frame(id = c("A", "B", "C", "A", "B", "C"),
                          sample_id = c(1, 1, 1, 2, 2, 2),
                          counts = c(23, 4, 3, 34, 5, 13))
  common_names <- c(A = "Lion", B = "Tiger", C = "Bear", "Oh my!")
  foods <- list(c("ungulates", "boar"),
                c("ungulates", "boar"),
                c("salmon", "fruit", "nuts"))

  # Parsing with taxon names
  Sys.sleep(1)
  name_result = lookup_tax_data(raw_data,
                                type = "taxon_name",
                                datasets = list(counts = abundance,
                                                my_names = common_names,
                                                foods = foods),
                                mappings = c("species_id" = "id",
                                             "species_id" = "{{name}}",
                                             "{{index}}" = "{{index}}"),
                                column = "species")
  Sys.sleep(1)

  # Parsing with fuzzy taxon name matches
  expect_equal(lookup_tax_data("poa annus", type = "fuzzy_name")$taxon_names(),
               lookup_tax_data("Poa annua", type = "taxon_name")$taxon_names())
  Sys.sleep(1)


  # Parsing with taxon ids
  id_result = lookup_tax_data(raw_data,
                              type = "taxon_id",
                              datasets = list(counts = abundance,
                                              my_names = common_names,
                                              foods = foods),
                              mappings = c("species_id" = "id",
                                           "species_id" = "{{name}}",
                                           "{{index}}" = "{{index}}"),
                              column = "my_tax_id")
  Sys.sleep(1)


  # Parsing with sequence ids
  seq_result = lookup_tax_data(raw_data,
                               type = "seq_id",
                               datasets = list(counts = abundance,
                                               my_names = common_names,
                                               foods = foods),
                               mappings = c("species_id" = "id",
                                            "species_id" = "{{name}}",
                                            "{{index}}" = "{{index}}"),
                               column = "my_seq")
  Sys.sleep(1)

  expect_equal(name_result, id_result)
  expect_equal(name_result, seq_result)

  # Expect an error if trying to use invalid database
  expect_error(lookup_tax_data(1:3, type = "seq_id", database = "not valid"),
               "not a valid database")

  # Expect an error if trying to use invalid database of sequence id
  expect_error(lookup_tax_data(1:3, type = "seq_id", database = "bold"),
               "not a valid database")

  # Expect an error if bad column name
  expect_error(lookup_tax_data(raw_data, column = "Not a column",
                               type = "seq_id"),
               'No column "Not a column" in input table')
  expect_error(lookup_tax_data(1:3, column = "Not a column", type = "seq_id"),
               'No item named "Not a column" in the following inputs:')
  expect_error(lookup_tax_data(1:3, column = 10, type = "seq_id"),
               'out of bounds for inputs:')

  # Failed downloads
  Sys.sleep(1)
  raw_data <- data.frame(species = c("Panthera leo",
                                     "not a taxon",
                                     "Ursus americanus"),
                         my_tax_id = c("9689", "not a taxon id 6", "9643"),
                         my_seq = c("AB548412", "777777777777", "DQ334818"),
                         species_id = c("A", "B", "C"))
  expect_warning(result <- lookup_tax_data(raw_data, type = "taxon_name", column = "species"))
  expect_warning(result <- lookup_tax_data(raw_data, type = "taxon_name", column = "species"), ask = FALSE)
  expect_equal(unname(result$data$query_data$taxon_id[2]), "unknown")
  expect_warning(result <- lookup_tax_data(raw_data, type = "taxon_id", column = "my_tax_id"))
  expect_equal(unname(result$data$query_data$taxon_id[2]), "unknown")
  expect_warning(result <- lookup_tax_data(raw_data, type = "seq_id", column = "my_seq"))
  expect_equal(unname(result$data$query_data$taxon_id[2]), "unknown")

  # Zero-length inputs produce empty taxmap objects
  expect_equal(taxon_names(lookup_tax_data(c())),          character(0))
  expect_equal(taxon_names(lookup_tax_data(character(0))), character(0))
  expect_equal(taxon_names(lookup_tax_data(list())),       character(0))
  expect_equal(taxon_names(lookup_tax_data(data.frame())), character(0))
})



test_that("Taxmap can be intialized from raw strings", {

  raw_data <- c(">var_1:A--var_2:9689--non_target--tax:K__Mammalia;P__Carnivora;C__Felidae;G__Panthera;S__leo",
                ">var_1:B--var_2:9694--non_target--tax:K__Mammalia;P__Carnivora;C__Felidae;G__Panthera;S__tigris",
                ">var_1:C--var_2:9643--non_target--tax:K__Mammalia;P__Carnivora;C__Felidae;G__Ursus;S__americanus")
  result <- extract_tax_data(raw_data,
                             key = c(var_1 = "info", var_2 = "info", tax = "class"),
                             regex = "^>var_1:(.+)--var_2:(.+)--non_target--tax:(.+)$",
                             class_sep = ";", class_regex = "^(.+)__(.+)$",
                             class_key = c(my_rank = "info", tax_name = "taxon_name"))
  expect_equal(length(result$taxa), 8)
  expect_equal(result$data$tax_data$var_1, c("A", "B", "C"))
  expect_true("my_rank" %in% colnames(result$data$class_data))

  # test that different info with same sep can be used in classifications
  raw_data <- c("K;Mammalia;P;Carnivora;C;Felidae;G;Panthera;S;leo;",
                "K;Mammalia;P;Carnivora;C;Felidae;G;Panthera;S;tigris;",
                "K;Mammalia;P;Carnivora;C;Felidae;G;Ursus;S;americanus;")
  result <- extract_tax_data(raw_data,
                             key = c(tax = "class"),
                             regex = "(.*)",
                             class_regex = "(.+?);(.*?);",
                             class_key = c(my_rank = "info", tax_name = "taxon_name"))
  expect_equal(length(result$taxa), 8)
  expect_equal(length(result$roots()), 1)
  expect_true("my_rank" %in% colnames(result$data$class_data))

  # Test if it complains about failed matches
  raw_data <- c(">var_1:A--var_2:9689--non_target--tax:K__Mammalia;P__Carnivora;C__Felidae;G__Panthera;S__leo",
                "not a match",
                ">var_1:C--var_2:9643--non_target--tax:K__Mammalia;P__Carnivora;C__Felidae;G__Ursus;S__americanus")
  expect_warning(extract_tax_data(raw_data,
                                  key = c(var_1 = "info", var_2 = "info", tax = "class"),
                                  regex = "^>var_1:(.+)--var_2:(.+)--non_target--tax:(.+)$",
                                  class_sep = ";", class_regex = "^(.+)__(.+)$",
                                  class_key = c(my_rank = "info", tax_name = "taxon_name")),
                 "indexes failed to match the regex supplied")

  # Zero-length inputs produce empty taxmap objects
  expect_equal(taxon_names(extract_tax_data(c())),          character(0))
  expect_equal(taxon_names(extract_tax_data(character(0))), character(0))
  expect_equal(taxon_names(extract_tax_data(list())),       character(0))
})


test_that("Taxmap can be intialized from raw strings and lookup data", {
  skip_on_cran()

  raw_data <- c(">var_1:A--var_2:9689--non_target--tax:K__Mammalia;P__Carnivora;C__Felidae;G__Panthera;S__leo",
                ">var_1:B--var_2:9694--non_target--tax:K__Mammalia;P__Carnivora;C__Felidae;G__Panthera;S__tigris",
                ">var_1:C--var_2:9643--non_target--tax:K__Mammalia;P__Carnivora;C__Felidae;G__Ursus;S__americanus")

  # Test looking up variables extracted from raw strings
  Sys.sleep(1)
  result <- extract_tax_data(raw_data,
                             key = c(var_1 = "info", var_2 = "taxon_id", tax = "info"),
                             regex = "^>var_1:(.+)--var_2:(.+)--non_target--tax:(.+)$")
  expect_true("ncbi_id" %in% colnames(result$data$tax_data))
  expect_equivalent(result$roots(value = "taxon_names"), "cellular organisms")

})

Try the metacoder package in your browser

Any scripts or data that you put into this service are public.

metacoder documentation built on April 4, 2023, 9:08 a.m.