tests/testthat/test-downsample.R

# test_that("downsample_immundata downsamples single-cell repertoires and is deterministic with seed", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     cell_id = c("c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8"),
#     sample_id = c("S1", "S1", "S1", "S1", "S2", "S2", "S2", "S2"),
#     v_call = c("IGHV1", "IGHV2", "IGHV3", "IGHV4", "IGHV1", "IGHV2", "IGHV3", "IGHV4"),
#     j_call = c("IGHJ1", "IGHJ2", "IGHJ3", "IGHJ4", "IGHJ1", "IGHJ2", "IGHJ3", "IGHJ4"),
#     junction_aa = c("CARA", "CARB", "CARC", "CARD", "CARE", "CARF", "CARG", "CARH"),
#     locus = "IGH",
#     umi_count = c(10, 11, 12, 13, 14, 15, 16, 17)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = make_receptor_schema(
#       features = c("v_call", "j_call", "junction_aa"),
#       chains = "IGH"
#     ),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "umi_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample single-cell deterministic"
#   )
#
#   ds1 <- downsample_immundata(idata, n = 2, seed = 100)
#   ds2 <- downsample_immundata(idata, n = 2, seed = 100)
#
#   reps <- ds1$repertoires |> collect()
#   expect_true(all(reps$n_barcodes == 2))
#
#   sampled1 <- ds1$annotations |>
#     select(sample_id, imd_barcode) |>
#     distinct() |>
#     arrange(sample_id, imd_barcode) |>
#     collect()
#
#   sampled2 <- ds2$annotations |>
#     select(sample_id, imd_barcode) |>
#     distinct() |>
#     arrange(sample_id, imd_barcode) |>
#     collect()
#
#   expect_equal(sampled1, sampled2)
# })
#
# test_that("downsample_immundata downsampled bulk repertoires by count", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     sample_id = c("S1", "S1", "S2", "S2"),
#     v_call = c("TRBV1", "TRBV2", "TRBV3", "TRBV4"),
#     j_call = c("TRBJ1", "TRBJ2", "TRBJ1", "TRBJ2"),
#     junction_aa = c("AAAA", "BBBB", "CCCC", "DDDD"),
#     clone_count = c(8, 7, 6, 9)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = c("v_call", "j_call", "junction_aa"),
#     count_col = "clone_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample bulk count mode"
#   )
#
#   ds <- downsample_immundata(idata, n = 5, seed = 42)
#   reps <- ds$repertoires |> collect()
#
#   expect_equal(sort(reps$n_barcodes), c(5, 5))
# })
#
# test_that("downsample_immundata keeps paired chains intact", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     cell_id = c(
#       "a1", "a1", "a2", "a2", "a3", "a3",
#       "b1", "b1", "b2", "b2", "b3", "b3"
#     ),
#     sample_id = c(
#       "S1", "S1", "S1", "S1", "S1", "S1",
#       "S2", "S2", "S2", "S2", "S2", "S2"
#     ),
#     v_call = c("IGHV1", "IGLV1", "IGHV2", "IGLV2", "IGHV3", "IGLV3", "IGHV1", "IGLV1", "IGHV2", "IGLV2", "IGHV3", "IGLV3"),
#     j_call = c("IGHJ1", "IGLJ1", "IGHJ2", "IGLJ2", "IGHJ3", "IGLJ3", "IGHJ1", "IGLJ1", "IGHJ2", "IGLJ2", "IGHJ3", "IGLJ3"),
#     junction_aa = c("CA1", "CL1", "CA2", "CL2", "CA3", "CL3", "CB1", "CK1", "CB2", "CK2", "CB3", "CK3"),
#     locus = c("IGH", "IGL", "IGH", "IGL", "IGH", "IGL", "IGH", "IGL", "IGH", "IGL", "IGH", "IGL"),
#     umi_count = rep(100, 12)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = make_receptor_schema(
#       features = c("v_call", "j_call", "junction_aa"),
#       chains = c("IGH", "IGL")
#     ),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "umi_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample paired-chain integrity"
#   )
#
#   ds <- downsample_immundata(idata, n = 2, seed = 7)
#   reps <- ds$repertoires |> collect()
#   expect_true(all(reps$n_barcodes == 2))
#
#   chain_stats <- ds$annotations |>
#     collect() |>
#     summarise(
#       .by = c(sample_id, imd_barcode),
#       n_loci = n_distinct(locus),
#       n_rows = n()
#     )
#
#   expect_true(all(chain_stats$n_loci == 2))
#   expect_true(all(chain_stats$n_rows == 2))
# })
#
# test_that("downsample_immundata works on IG test data with proportion n", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   idata <- read_repertoires(
#     path = test_ig_data(),
#     schema = make_receptor_schema(
#       features = c("v_call", "j_call", "junction_aa"),
#       chains = c("IGH", "IGK|IGL")
#     ),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "duplicate_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   ) |>
#     mutate_immundata(cohort = "all")
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "cohort",
#     context = "downsample IG proportion"
#   )
#
#   n_before <- idata$repertoires |>
#     collect() |>
#     pull(n_barcodes)
#
#   ds <- downsample_immundata(idata, n = 0.5, seed = 123)
#   n_after <- ds$repertoires |>
#     collect() |>
#     pull(n_barcodes)
#
#   expect_equal(n_after, floor(n_before * 0.5))
#
#   chain_stats <- ds$annotations |>
#     collect() |>
#     summarise(
#       .by = imd_barcode,
#       n_loci = n_distinct(locus)
#     )
#   expect_true(all(chain_stats$n_loci == 2))
# })
#
# test_that("downsample_immundata with n = 1 keeps one receptor per repertoire", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     cell_id = c("s1c1", "s1c2", "s1c3", "s1c4", "s2c1", "s2c2", "s2c3", "s2c4"),
#     sample_id = c("S1", "S1", "S1", "S1", "S2", "S2", "S2", "S2"),
#     v_call = c("IGHV1", "IGHV1", "IGHV2", "IGHV3", "IGHV4", "IGHV5", "IGHV5", "IGHV6"),
#     j_call = c("IGHJ1", "IGHJ1", "IGHJ2", "IGHJ3", "IGHJ4", "IGHJ5", "IGHJ5", "IGHJ6"),
#     junction_aa = c("A", "A", "B", "C", "D", "E", "E", "F"),
#     locus = "IGH",
#     umi_count = c(12, 11, 10, 9, 8, 7, 6, 5)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = make_receptor_schema(
#       features = c("v_call", "j_call", "junction_aa"),
#       chains = "IGH"
#     ),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "umi_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample n=1 receptor mode"
#   )
#
#   ds1 <- downsample_immundata(idata, n = 1, seed = 321)
#   ds2 <- downsample_immundata(idata, n = 1, seed = 321)
#
#   receptor_stats <- ds1$annotations |>
#     collect() |>
#     summarise(
#       .by = sample_id,
#       n_receptors = n_distinct(imd_receptor_id)
#     ) |>
#     arrange(sample_id)
#
#   expect_true(all(receptor_stats$n_receptors == 1))
#
#   reps <- ds1$repertoires |>
#     collect() |>
#     arrange(sample_id)
#   expect_true(all(reps$n_receptors == 1))
#
#   sampled1 <- ds1$annotations |>
#     select(sample_id, imd_receptor_id) |>
#     distinct() |>
#     arrange(sample_id, imd_receptor_id) |>
#     collect()
#
#   sampled2 <- ds2$annotations |>
#     select(sample_id, imd_receptor_id) |>
#     distinct() |>
#     arrange(sample_id, imd_receptor_id) |>
#     collect()
#
#   expect_equal(sampled1, sampled2)
# })
#
# test_that("downsample_immundata validates n and handles no-repertoire fallback", {
#   idata <- get_test_idata_tsv_no_metadata()
#
#   expect_error(
#     downsample_immundata(idata, n = 2.5),
#     "integer count"
#   )
#
#   n_before <- idata$annotations |>
#     distinct(imd_barcode) |>
#     collect() |>
#     nrow()
#
#   ds <- downsample_immundata(idata, n = 0.1, seed = 1)
#
#   n_after <- ds$annotations |>
#     distinct(imd_barcode) |>
#     collect() |>
#     nrow()
#
#   expect_lt(n_after, n_before)
#   expect_null(ds$repertoires)
# })
#
# test_that("downsample_immundata warns and keeps repertoire unchanged when n exceeds available units", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     cell_id = c("c1", "c2", "c3", "c4", "c5", "c6"),
#     sample_id = c("S1", "S1", "S1", "S2", "S2", "S2"),
#     v_call = c("IGHV1", "IGHV2", "IGHV3", "IGHV1", "IGHV2", "IGHV3"),
#     j_call = c("IGHJ1", "IGHJ2", "IGHJ3", "IGHJ1", "IGHJ2", "IGHJ3"),
#     junction_aa = c("A1", "A2", "A3", "B1", "B2", "B3"),
#     locus = "IGH",
#     umi_count = c(1, 1, 1, 1, 1, 1)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = make_receptor_schema(features = c("v_call", "j_call", "junction_aa"), chains = "IGH"),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "umi_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample n exceeds units"
#   )
#
#   reps_before <- idata$repertoires |>
#     collect() |>
#     arrange(sample_id)
#
#   ds <- NULL
#   expect_warning(
#     ds <- downsample_immundata(idata, n = 10, seed = 42),
#     "returned unchanged"
#   )
#
#   reps_after <- ds$repertoires |>
#     collect() |>
#     arrange(sample_id)
#
#   expect_equal(reps_after$n_barcodes, reps_before$n_barcodes)
# })
#
# test_that("downsample_immundata supports count-mode proportion downsampling", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     sample_id = c("S1", "S1", "S2", "S2"),
#     v_call = c("TRBV1", "TRBV2", "TRBV3", "TRBV4"),
#     j_call = c("TRBJ1", "TRBJ2", "TRBJ1", "TRBJ2"),
#     junction_aa = c("AAAA", "BBBB", "CCCC", "DDDD"),
#     clone_count = c(8, 7, 6, 9)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = c("v_call", "j_call", "junction_aa"),
#     count_col = "clone_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample count proportion"
#   )
#
#   ds <- downsample_immundata(idata, n = 0.5, seed = 101)
#   reps <- ds$repertoires |> collect()
#
#   expect_equal(sort(reps$n_barcodes), c(7, 7))
# })
#
# test_that("downsample_immundata is deterministic in count mode with seed", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     sample_id = c("S1", "S1", "S2", "S2"),
#     v_call = c("TRBV1", "TRBV2", "TRBV3", "TRBV4"),
#     j_call = c("TRBJ1", "TRBJ2", "TRBJ1", "TRBJ2"),
#     junction_aa = c("AAAA", "BBBB", "CCCC", "DDDD"),
#     clone_count = c(8, 7, 6, 9)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = c("v_call", "j_call", "junction_aa"),
#     count_col = "clone_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample count deterministic"
#   )
#
#   ds1 <- downsample_immundata(idata, n = 5, seed = 222)
#   ds2 <- downsample_immundata(idata, n = 5, seed = 222)
#
#   ann1 <- ds1$annotations |>
#     select(sample_id, imd_barcode, imd_n_chains) |>
#     arrange(sample_id, imd_barcode) |>
#     collect()
#   ann2 <- ds2$annotations |>
#     select(sample_id, imd_barcode, imd_n_chains) |>
#     arrange(sample_id, imd_barcode) |>
#     collect()
#
#   expect_equal(ann1, ann2)
# })
#
# test_that("downsample_immundata errors when proportion results in zero target", {
#   idata <- get_test_idata_tsv_no_metadata()
#
#   expect_error(
#     downsample_immundata(idata, n = 0.0001, seed = 1),
#     "Increase `n`"
#   )
# })
#
# test_that("downsample_immundata produces consistent imd_count and imd_proportion invariants", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     cell_id = c("c1", "c2", "c3", "c4", "c5", "c6"),
#     sample_id = c("S1", "S1", "S1", "S2", "S2", "S2"),
#     v_call = c("IGHV1", "IGHV1", "IGHV2", "IGHV1", "IGHV2", "IGHV2"),
#     j_call = c("IGHJ1", "IGHJ1", "IGHJ2", "IGHJ1", "IGHJ2", "IGHJ2"),
#     junction_aa = c("A1", "A1", "A2", "B1", "B2", "B2"),
#     locus = "IGH",
#     umi_count = c(10, 11, 12, 13, 14, 15)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = make_receptor_schema(features = c("v_call", "j_call", "junction_aa"), chains = "IGH"),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "umi_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample count/proportion invariants"
#   )
#
#   ds <- downsample_immundata(idata, n = 2, seed = 33)
#
#   reps <- ds$repertoires |> collect()
#   ann <- ds$annotations |> collect()
#
#   receptor_stats <- ann |>
#     select(imd_repertoire_id, imd_receptor_id, imd_count, imd_proportion) |>
#     distinct() |>
#     summarise(
#       .by = imd_repertoire_id,
#       sum_count = sum(imd_count),
#       sum_prop = sum(imd_proportion)
#     ) |>
#     arrange(imd_repertoire_id)
#
#   reps <- reps |> arrange(imd_repertoire_id)
#
#   expect_equal(receptor_stats$sum_count, reps$n_barcodes)
#   expect_true(all(abs(receptor_stats$sum_prop - 1) < 1e-8))
# })

Try the immundata package in your browser

Any scripts or data that you put into this service are public.

immundata documentation built on April 4, 2026, 9:09 a.m.