# tests/testthat/test-dplyr-distinct.R

# Helper for the tests below: runs `collect.data_request()` through
# `purrr::quietly()` and hands back only the `result` element.
quiet_collect <- function(x) {
  captured <- purrr::quietly(collect.data_request)(x)
  captured[["result"]]
}
# Set the `email` entry of galah's configuration before any test below runs.
# NOTE(review): this is a file-scope call and nothing in the visible part of
# this file restores the previous setting - confirm that is intended.
galah_config(email = "ala4r@ala.org.au")

test_that("`group_by()` without `distinct()` returns occurrences, not species", {
  skip_if_offline()
  skip_on_ci()
  # Base request shared by the count and the download
  crinia_2024 <- galah_call() |>
    filter(year == 2024, genus == "Crinia")
  # Record count for the same filters; compared with the download size below
  n_records <- quiet_collect(count(crinia_2024))
  grouped <- group_by(crinia_2024, speciesID)
  # Grouping on its own should leave the request type as occurrences
  expect_equal(grouped$type, "occurrences")
  occ <- quiet_collect(grouped)
  expect_s3_class(occ, c("tbl_df", "tbl", "data.frame"))
  expect_equal(nrow(occ), n_records$count)
  # Fewer distinct taxon IDs than rows, i.e. rows were not collapsed per taxon
  n_taxa <- length(unique(occ$taxonConceptID))
  expect_true(n_taxa < nrow(occ))
})

test_that("distinct() with no arguments and no `group_by()` returns occurrences (i.e. does nothing)", {
  skip_if_offline()
  skip_on_ci()
  base_request <- filter(galah_call(), year == 2024, genus == "Crinia")
  # Number of matching records, fetched before the download for comparison
  record_count <- base_request |>
    count() |>
    quiet_collect()
  # `distinct()` with nothing to act on
  after_distinct <- distinct(base_request)
  expect_equal(after_distinct$type, "occurrences")
  downloaded <- quiet_collect(after_distinct)
  expect_s3_class(downloaded, c("tbl_df", "tbl", "data.frame"))
  # Row count matches the plain record count, so nothing was de-duplicated
  expect_equal(nrow(downloaded), record_count$count)
  unique_taxa <- unique(downloaded$taxonConceptID)
  expect_true(length(unique_taxa) < nrow(downloaded))
})

test_that("`group_by() |> distinct(.keep_all = FALSE)` uses occurrences-count, but *doesn't* return counts", {
  skip_if_offline()
  skip_on_ci()
  request <- galah_call() |>
    filter(year == 2024, genus == "Crinia") |>
    group_by(speciesID) |>
    distinct()
  species_ids <- quiet_collect(request)
  expect_s3_class(species_ids, c("tbl_df", "tbl", "data.frame"))
  # Only the grouping column should come back - no count column
  expect_equal(ncol(species_ids), 1)
  expect_equal(colnames(species_ids), "speciesID")
  # Every returned ID is unique
  n_unique <- length(unique(species_ids$speciesID))
  expect_equal(n_unique, nrow(species_ids))
})

test_that("`group_by() |> distinct(.keep_all = TRUE)` converts type from occurrences to species", {
  skip_if_offline()
  skip_on_ci()
  grouped_distinct <- galah_call() |>
    filter(year == 2024, genus == "Crinia") |>
    group_by(speciesID) |>
    distinct(.keep_all = TRUE)
  # Collapse first so the resulting query type can be inspected
  collapsed <- collapse(grouped_distinct)
  expect_equal(collapsed$type, "data/species")
  species_tbl <- quiet_collect(collapsed)
  expect_s3_class(species_tbl, c("tbl_df", "tbl", "data.frame"))
  # One row per value of `species`
  n_species <- length(unique(species_tbl$species))
  expect_equal(n_species, nrow(species_tbl))
})

test_that("distinct(variable, .keep_all = FALSE) returns field values", {
  skip_if_offline()
  skip_on_ci()
  field_request <- distinct(galah_call(), cl11226)
  field_values <- quiet_collect(field_request)
  # Expect a single-column tibble holding the values of the requested field
  expect_s3_class(field_values, c("tbl_df", "tbl", "data.frame"))
  expect_equal(colnames(field_values), "cl11226")
  expect_gte(nrow(field_values), 10)
})

test_that("`distinct(.keep_all = TRUE)` sets species queries", {
  skip_if_offline()
  skip_on_ci()
  species_request <- galah_call() |>
    identify("Osphranter") |>
    distinct(speciesID, .keep_all = TRUE)
  species_tbl <- quiet_collect(species_request)
  expect_gte(nrow(species_tbl), 4)
  expect_s3_class(species_tbl, c("tbl_df", "tbl", "data.frame"))
  # With `.keep_all = TRUE` the result should carry these extra columns
  required_cols <- c("species", "species_name", "kingdom")
  expect_contains(colnames(species_tbl), required_cols)
})

test_that("`distinct(.keep_all = TRUE)` accepts non-species-level groupings", {
  skip_if_offline()
  skip_on_ci()
  # Distinct `genusID` values for "Limnodynastidae", keeping all columns
  genera <- galah_call() |>
    identify("Limnodynastidae") |>
    distinct(genusID, .keep_all = TRUE) |>
    quiet_collect()
  expect_s3_class(genera, c("tbl_df", "tbl", "data.frame"))
  # Check each bound separately rather than `expect_true(a & b)`, so a
  # failure reports the actual row count and which bound was violated
  expect_gt(nrow(genera), 4)
  expect_lt(nrow(genera), 10)
  # Every returned row should be ranked as a genus
  expect_true(all(genera$taxon_rank == "genus"))
})

test_that("`distinct(variable) |> count()` can be used to count the number of levels", {
  # NOTE: `basisOfRecord` is used because its number of values is easy to
  # verify; taxonomic identifiers are more slippery and so less reliable here
  skip_if_offline()
  skip_on_ci()
  distinct_request <- distinct(galah_call(), basisOfRecord)
  # First fetch the levels themselves ...
  all_levels <- quiet_collect(distinct_request)
  # ... then ask for the number of levels via `count()`
  n_levels <- distinct_request |>
    count() |>
    quiet_collect()
  expect_s3_class(n_levels, c("tbl_df", "tbl", "data.frame"))
  expect_equal(nrow(n_levels), 1)
  # The reported count must agree with the number of rows fetched directly
  expect_equal(nrow(all_levels), n_levels$count)
})

test_that("`group_by(something) |> distinct(speciesID) |> count()` gives grouped number of categories", {
  skip_if_offline()
  skip_on_ci()
  # Use `quiet_collect()` like every other test in this file, rather than a
  # bare `collect()`
  result <- galah_call() |>
    identify("perameles") |>
    group_by(basisOfRecord) |>
    distinct(speciesID) |>
    count() |>
    quiet_collect()
  expect_equal(colnames(result),
               c("basisOfRecord", "count"))
  # Guard against an empty result, for which `all()` below would be
  # vacuously TRUE
  expect_gt(nrow(result), 0)
  # Counts are numbers of distinct `speciesID` values per group, so they
  # should stay small
  expect_true(all(result$count < 10))
})

# FIXME: not implemented
# test_that("`add_count() |> distinct()` adds record counts to each species", {
#  skip_if_offline(); skip_on_ci()
#  query <- galah_call() |>
#    filter(year == 2024,
#           genus == "Crinia") |>
#    group_by(speciesID) |>
#    add_count() |>
#    distinct(.keep_all = TRUE) |>
#    collapse()
#  expect_equal(query$type, "data/species")
#  x <- quiet_collect(query)
#  expect_s3_class(x,
#                  c("tbl_df", "tbl", "data.frame"))
#  expect_equal(length(unique(x$speciesID)),
#               nrow(x))
# })

# test_that("add_count() without `distinct()` just adds a column of 1s", {
#   skip("not built")
# })

# Remove the file-level `quiet_collect()` helper now that the tests above
# have been defined and run.
rm(quiet_collect)

# Try the galah package in your browser

# Any scripts or data that you put into this service are public.

# galah documentation built on Feb. 11, 2026, 9:11 a.m.