Nothing
# ============================================================================
# TEST 1: Single-chain data with duplicated receptors
# ============================================================================
test_that("agg_repertoires counts single-chain receptors correctly", {
output_dir <- create_test_output_dir()
on.exit(cleanup_output_dir(output_dir))
# Create synthetic single-chain data
# Sample1: 3 cells, 2 unique receptors (cells 1&2 share receptor)
# Sample2: 2 cells, 2 unique receptors
test_data <- data.frame(
cell_id = c("cell1", "cell2", "cell3", "cell4", "cell5"),
sample_id = c("Sample1", "Sample1", "Sample1", "Sample2", "Sample2"),
v_call = c("IGHV1", "IGHV1", "IGHV2", "IGHV3", "IGHV4"),
j_call = c("IGHJ1", "IGHJ1", "IGHJ2", "IGHJ3", "IGHJ4"),
junction_aa = c("CARW", "CARW", "CBRW", "CCRW", "CDRW"), # cell1 & cell2 identical
locus = c("IGH", "IGH", "IGH", "IGH", "IGH"),
umi_count = c(100, 150, 200, 250, 300)
)
temp_file <- tempfile(fileext = ".tsv")
readr::write_tsv(test_data, temp_file)
on.exit(unlink(temp_file), add = TRUE)
# Read and aggregate
idata <- read_repertoires(
path = temp_file,
schema = make_receptor_schema(
features = c("v_call", "j_call", "junction_aa"),
chains = "IGH"
),
barcode_col = "cell_id",
locus_col = "locus",
umi_col = "umi_count",
output_folder = output_dir,
preprocess = NULL,
postprocess = NULL,
rename_columns = NULL
)
# Aggregate repertoires by sample_id
idata_agg <- agg_repertoires_with_integrity(
idata,
schema = "sample_id",
context = "single-chain duplicated receptors"
)
repertoires <- idata_agg$repertoires |> collect()
annotations <- idata_agg$annotations |> collect()
# Expected results:
# Sample1: n_barcodes = 3, n_receptors = 2 (cells 1&2 share receptor)
# Sample2: n_barcodes = 2, n_receptors = 2
sample1_stats <- repertoires |> filter(sample_id == "Sample1")
sample2_stats <- repertoires |> filter(sample_id == "Sample2")
expect_equal(sample1_stats$n_barcodes, 3,
info = "Sample1 should have 3 barcodes"
)
expect_equal(sample1_stats$n_receptors, 2,
info = "Sample1 should have 2 unique receptors"
)
expect_equal(sample2_stats$n_barcodes, 2,
info = "Sample2 should have 2 barcodes"
)
expect_equal(sample2_stats$n_receptors, 2,
info = "Sample2 should have 2 unique receptors"
)
# Verify receptor counts within repertoires
receptor_counts <- annotations |>
group_by(sample_id, imd_receptor_id) |>
summarise(
total_count1 = first(imd_count),
total_count2 = last(imd_count),
.groups = "drop"
)
# The shared receptor in Sample1 should have count = 2
sample1_counts <- receptor_counts |>
filter(sample_id == "Sample1") |>
arrange(desc(total_count1))
expect_equal(sample1_counts$total_count1[1], 2,
info = "Shared receptor should have count of 2"
)
sample1_counts2 <- receptor_counts |>
filter(sample_id == "Sample1") |>
arrange(desc(total_count2))
expect_equal(sample1_counts$total_count2[1], 2,
info = "Shared receptor should have count of 2"
)
})
# ============================================================================
# TEST 2: Paired-chain data - potential double counting issue
# ============================================================================
test_that("agg_repertoires identifies double-counting in paired-chain data", {
output_dir <- create_test_output_dir()
on.exit(cleanup_output_dir(output_dir))
# Create synthetic paired-chain data
# Each receptor appears TWICE (once per chain)
test_data <- data.frame(
cell_id = c("cell1", "cell1", "cell2", "cell2", "cell3", "cell3"),
sample_id = c("Sample1", "Sample1", "Sample1", "Sample1", "Sample2", "Sample2"),
v_call = c("IGHV1", "IGLV1", "IGHV2", "IGLV2", "IGHV3", "IGLV3"),
j_call = c("IGHJ1", "IGLJ1", "IGHJ2", "IGLJ2", "IGHJ3", "IGLJ3"),
junction_aa = c("CARW", "CASL", "CBRW", "CBSL", "CCRW", "CCSL"),
locus = c("IGH", "IGL", "IGH", "IGL", "IGH", "IGL"),
umi_count = c(100, 100, 150, 150, 200, 200) # Same count for both chains
)
temp_file <- tempfile(fileext = ".tsv")
readr::write_tsv(test_data, temp_file)
on.exit(unlink(temp_file), add = TRUE)
# Read with paired-chain schema
idata <- read_repertoires(
path = temp_file,
schema = make_receptor_schema(
features = c("v_call", "j_call", "junction_aa"),
chains = c("IGH", "IGL")
),
barcode_col = "cell_id",
locus_col = "locus",
umi_col = "umi_count",
output_folder = output_dir,
preprocess = NULL,
postprocess = NULL,
rename_columns = NULL
)
# Aggregate repertoires
idata_agg <- agg_repertoires_with_integrity(
idata,
schema = "sample_id",
context = "paired-chain double-counting check"
)
repertoires <- idata_agg$repertoires |> collect()
annotations <- idata_agg$annotations |> collect()
# Current behavior (with double-counting bug):
# Sample1: n_barcodes will be 4 (2 cells × 2 chains) - WRONG!
# Sample2: n_barcodes will be 2 (1 cell × 2 chains) - WRONG!
# The correct values should be:
# Sample1: n_barcodes = 2 (actual cells)
# Sample2: n_barcodes = 1 (actual cell)
sample1_stats <- repertoires |> filter(sample_id == "Sample1")
sample2_stats <- repertoires |> filter(sample_id == "Sample2")
# These tests document the CURRENT behavior (with double-counting)
# They should FAIL when the bug is fixed
current_behavior_sample1 <- sample1_stats$n_barcodes
current_behavior_sample2 <- sample2_stats$n_barcodes
# Test for n_receptors (this should be correct)
expect_equal(sample1_stats$n_receptors, 2,
info = "Sample1 should have 2 unique receptors"
)
expect_equal(sample2_stats$n_receptors, 1,
info = "Sample2 should have 1 unique receptor"
)
expect_equal(sample1_stats$n_barcodes, 2)
expect_equal(sample2_stats$n_barcodes, 1)
})
# ============================================================================
# TEST 3: Mixed scenario - paired-chain with shared receptors
# ============================================================================
test_that("agg_repertoires handles paired-chain data with shared receptors", {
output_dir <- create_test_output_dir()
on.exit(cleanup_output_dir(output_dir))
# Create paired-chain data where some cells share receptors
test_data <- data.frame(
cell_id = c("cell1", "cell1", "cell2", "cell2", "cell3", "cell3"),
sample_id = c("Sample1", "Sample1", "Sample1", "Sample1", "Sample1", "Sample1"),
v_call = c("IGHV1", "IGLV1", "IGHV1", "IGLV1", "IGHV2", "IGLV2"),
j_call = c("IGHJ1", "IGLJ1", "IGHJ1", "IGLJ1", "IGHJ2", "IGLJ2"),
junction_aa = c("CARW", "CASL", "CARW", "CASL", "CBRW", "CBSL"),
locus = c("IGH", "IGL", "IGH", "IGL", "IGH", "IGL"),
umi_count = c(100, 100, 150, 150, 200, 200)
)
temp_file <- tempfile(fileext = ".tsv")
readr::write_tsv(test_data, temp_file)
on.exit(unlink(temp_file), add = TRUE)
idata <- read_repertoires(
path = temp_file,
schema = make_receptor_schema(
features = c("v_call", "j_call", "junction_aa"),
chains = c("IGH", "IGL")
),
barcode_col = "cell_id",
locus_col = "locus",
umi_col = "umi_count",
output_folder = output_dir,
preprocess = NULL,
postprocess = NULL,
rename_columns = NULL
)
idata_agg <- agg_repertoires_with_integrity(
idata,
schema = "sample_id",
context = "paired-chain shared receptors"
)
repertoires <- idata_agg$repertoires |> collect()
annotations <- idata_agg$annotations |> collect()
# Expected (correct) results:
# - 3 actual cells
# - 2 unique receptors (cells 1&2 share)
# Current (incorrect due to double-counting):
# - n_barcodes will show 6 (3 cells × 2 chains)
expect_equal(repertoires$n_receptors, 2,
info = "Should have 2 unique receptors"
)
# Check receptor counts
receptor_counts <- annotations |>
select(imd_receptor_id, imd_count) |>
distinct()
# The shared receptor should have count = 2 × 2 = 4 (double-counted)
# Should be 2 (two cells sharing the receptor)
shared_receptor_count <- max(receptor_counts$imd_count)
expect_equal(shared_receptor_count, 2)
})
# ============================================================================
# TEST 4: Proportions calculation with double-counting
# ============================================================================
test_that("agg_repertoires proportions are affected by double-counting", {
output_dir <- create_test_output_dir()
on.exit(cleanup_output_dir(output_dir))
# Simple paired-chain data for clear proportion calculation
test_data <- data.frame(
cell_id = c("cell1", "cell1", "cell2", "cell2"),
sample_id = c("Sample1", "Sample1", "Sample1", "Sample1"),
v_call = c("IGHV1", "IGLV1", "IGHV2", "IGLV2"),
j_call = c("IGHJ1", "IGLJ1", "IGHJ2", "IGLJ2"),
junction_aa = c("CARW", "CASL", "CBRW", "CBSL"),
locus = c("IGH", "IGL", "IGH", "IGL"),
umi_count = c(100, 100, 100, 100) # Equal counts
)
temp_file <- tempfile(fileext = ".tsv")
readr::write_tsv(test_data, temp_file)
on.exit(unlink(temp_file), add = TRUE)
idata <- read_repertoires(
path = temp_file,
schema = make_receptor_schema(
features = c("v_call", "j_call", "junction_aa"),
chains = c("IGH", "IGL")
),
barcode_col = "cell_id",
locus_col = "locus",
umi_col = "umi_count",
output_folder = output_dir,
preprocess = NULL,
postprocess = NULL,
rename_columns = NULL
)
idata_agg <- agg_repertoires_with_integrity(
idata,
schema = "sample_id",
context = "paired-chain proportions"
)
annotations <- idata_agg$annotations |> collect()
# Get unique proportions
proportions <- annotations |>
select(imd_receptor_id, imd_proportion) |>
distinct()
expect_equal(unique(proportions$imd_proportion), 0.5)
})
# ============================================================================
# TEST 5: Verify n_repertoires calculation (should be correct)
# ============================================================================
test_that("agg_repertoires n_repertoires calculation is correct for paired data", {
output_dir <- create_test_output_dir()
on.exit(cleanup_output_dir(output_dir))
# Create data where one receptor appears in multiple samples
test_data <- data.frame(
cell_id = c("cell1", "cell1", "cell2", "cell2", "cell3", "cell3"),
sample_id = c("Sample1", "Sample1", "Sample2", "Sample2", "Sample3", "Sample3"),
v_call = c("IGHV1", "IGLV1", "IGHV1", "IGLV1", "IGHV2", "IGLV2"),
j_call = c("IGHJ1", "IGLJ1", "IGHJ1", "IGLJ1", "IGHJ2", "IGLJ2"),
junction_aa = c("CARW", "CASL", "CARW", "CASL", "CBRW", "CBSL"),
locus = c("IGH", "IGL", "IGH", "IGL", "IGH", "IGL"),
umi_count = rep(100, 6)
)
temp_file <- tempfile(fileext = ".tsv")
readr::write_tsv(test_data, temp_file)
on.exit(unlink(temp_file), add = TRUE)
idata <- read_repertoires(
path = temp_file,
schema = make_receptor_schema(
features = c("v_call", "j_call", "junction_aa"),
chains = c("IGH", "IGL")
),
barcode_col = "cell_id",
locus_col = "locus",
umi_col = "umi_count",
output_folder = output_dir,
preprocess = NULL,
postprocess = NULL,
rename_columns = NULL
)
idata_agg <- agg_repertoires_with_integrity(
idata,
schema = "sample_id",
context = "paired-chain n_repertoires"
)
annotations <- idata_agg$annotations |> collect()
# Check n_repertoires for the shared receptor
n_repertoires_vals <- annotations |>
select(imd_receptor_id, n_repertoires) |>
distinct()
# First receptor appears in Sample1 and Sample2 (n_repertoires = 2)
# Second receptor appears only in Sample3 (n_repertoires = 1)
expect_true(2 %in% n_repertoires_vals$n_repertoires,
info = "Shared receptor should appear in 2 repertoires"
)
expect_true(1 %in% n_repertoires_vals$n_repertoires,
info = "Unique receptor should appear in 1 repertoire"
)
})
test_that("agg_repertoires preserves second chain data (no NAs)", {
output_dir <- create_test_output_dir()
on.exit(cleanup_output_dir(output_dir))
# Create paired-chain data
# Cell1: IGH + IGL
test_data <- data.frame(
cell_id = c("cell1", "cell1"),
sample_id = c("Sample1", "Sample1"),
v_call = c("IGHV1", "IGLV1"),
j_call = c("IGHJ1", "IGLJ1"),
junction_aa = c("CARW", "CASL"),
locus = c("IGH", "IGL"),
umi_count = c(100, 100)
)
temp_file <- tempfile(fileext = ".tsv")
readr::write_tsv(test_data, temp_file)
on.exit(unlink(temp_file), add = TRUE)
idata <- read_repertoires(
path = temp_file,
schema = make_receptor_schema(
features = c("v_call", "j_call", "junction_aa"),
chains = c("IGH", "IGL")
),
barcode_col = "cell_id",
locus_col = "locus",
umi_col = "umi_count",
output_folder = output_dir
)
# Run aggregation
idata_agg <- agg_repertoires_with_integrity(
idata,
schema = "sample_id",
context = "paired-chain no-NA preservation"
)
annotations <- idata_agg$annotations |> collect()
# ----------------------------------------------------------------------
# CHECK 1: Row Count
# The input had 2 rows (1 cell x 2 chains).
# The output MUST have 2 rows.
# (The bug caused this to drop to 1 row).
# ----------------------------------------------------------------------
expect_equal(nrow(annotations), 2,
info = "Annotation table should retain rows for both chains (IGH and IGL)"
)
# ----------------------------------------------------------------------
# CHECK 2: Second Chain Data Presence
# Filter for the Light Chain row and ensure V-call is present (not NA)
# ----------------------------------------------------------------------
igl_row <- annotations |> filter(locus == "IGL")
expect_equal(nrow(igl_row), 1,
info = "Should find exactly one row for the IGL chain"
)
expect_false(is.na(igl_row$v_call),
info = "IGL row should have valid v_call data (not NA)"
)
expect_equal(igl_row$v_call, "IGLV1",
info = "IGL v_call should match input data"
)
# ----------------------------------------------------------------------
# CHECK 3: Statistics Mapping
# Ensure the calculated stats (which are per-receptor) are mapped
# correctly to BOTH chain rows.
# ----------------------------------------------------------------------
# Both IGH and IGL rows belong to the same receptor, so they should
# both have the same imd_count and imd_proportion.
igh_row <- annotations |> filter(locus == "IGH")
expect_equal(igh_row$imd_count, igl_row$imd_count,
info = "Both chains of the same receptor should share the same count stats"
)
expect_equal(igh_row$imd_repertoire_id, igl_row$imd_repertoire_id,
info = "Both chains should belong to the same repertoire ID"
)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.