# tests/testthat/test-purity-filter.R

#------------------------------------------------------------------------------#
# Global vars
#------------------------------------------------------------------------------#

# Shared input fixture: the same 8 integration loci (all on chr "1") are
# listed once for each of 5 CellMarker values, giving 40 rows, all with
# TimePoint "01".
df <- tibble::tibble(
    chr = rep("1", times = 40),
    # Locus, strand and gene annotation repeat identically in every
    # 8-row marker block.
    integration_locus = rep(
        c(
            121249, 251227, 645551, 732938,
            775536, 846681, 1029785, 1036835
        ),
        times = 5
    ),
    strand = rep(c("+", "+", "+", "+", "+", "+", "-", "+"), times = 5),
    GeneName = rep(
        c(
            "LOC729737", "LOC100132287", "LOC100133331",
            "LOC100288069", "LINC01128", "LOC100130417",
            "C1orf159", "C1orf159"
        ),
        times = 5
    ),
    GeneStrand = rep(c("-", "+", "-", "-", "+", "-", "-", "-"), times = 5),
    # 8 consecutive rows per marker.
    CellMarker = rep(c("CD13", "CD14", "CD19", "CD3", "CD34"), each = 8),
    # CD13 -> BM, CD14/CD19/CD3 -> PB, CD34 -> BM.
    Tissue = rep(c("BM", "PB", "BM"), times = c(8, 24, 8)),
    TimePoint = rep("01", times = 40),
    Value = c(
        # CD13
        1, 1, 1, 1000, 1, 10, 3, 3,
        # CD14
        1, 1000, 1000, 500, 1, 12, 30, 1000,
        # CD19
        1, 1, 500, 1, 10, 14, 30, 90,
        # CD3
        1, 1, 1, 1, 10, 9, 30, 90,
        # CD34
        1000, 1, 1, 300, 10, 8, 1000, 3
    )
)

# Expected result of the "sc" test (purity_filter applied to the raw `Value`
# column of `df`), written one row per line.
expected_output_sc <- tibble::tribble(
    ~chr, ~integration_locus, ~strand, ~GeneName, ~GeneStrand,
    ~TimePoint, ~CellMarker, ~Tissue, ~Value,
    "1", 121249, "+", "LOC729737", "-", "01", "CD34", "BM", 1000,
    "1", 251227, "+", "LOC100132287", "+", "01", "CD14", "PB", 1000,
    "1", 645551, "+", "LOC100133331", "-", "01", "CD14", "PB", 1000,
    "1", 645551, "+", "LOC100133331", "-", "01", "CD19", "PB", 500,
    "1", 732938, "+", "LOC100288069", "-", "01", "CD13", "BM", 1000,
    "1", 732938, "+", "LOC100288069", "-", "01", "CD14", "PB", 500,
    "1", 732938, "+", "LOC100288069", "-", "01", "CD34", "BM", 300,
    "1", 775536, "+", "LINC01128", "+", "01", "CD13", "BM", 1,
    "1", 775536, "+", "LINC01128", "+", "01", "CD14", "PB", 1,
    "1", 775536, "+", "LINC01128", "+", "01", "CD19", "PB", 10,
    "1", 775536, "+", "LINC01128", "+", "01", "CD3", "PB", 10,
    "1", 775536, "+", "LINC01128", "+", "01", "CD34", "BM", 10,
    "1", 846681, "+", "LOC100130417", "-", "01", "CD13", "BM", 10,
    "1", 846681, "+", "LOC100130417", "-", "01", "CD14", "PB", 12,
    "1", 846681, "+", "LOC100130417", "-", "01", "CD19", "PB", 14,
    "1", 846681, "+", "LOC100130417", "-", "01", "CD3", "PB", 9,
    "1", 846681, "+", "LOC100130417", "-", "01", "CD34", "BM", 8,
    "1", 1029785, "-", "C1orf159", "-", "01", "CD34", "BM", 1000,
    "1", 1036835, "+", "C1orf159", "-", "01", "CD14", "PB", 1000
)
# Expected result of the "abundance" test, where purity_filter is applied to
# the `Value_PercAbundance` column; 18 rows ordered by locus. Columns made of
# consecutive runs are spelled out as run lengths.
expected_output_ab <- tibble::tibble(
    chr = rep("1", times = 18),
    integration_locus = rep(
        c(121249, 251227, 645551, 732938, 775536, 846681, 1029785, 1036835),
        times = c(1, 1, 2, 3, 2, 3, 3, 3)
    ),
    # Only the three rows of locus 1029785 are on the "-" strand.
    strand = rep(c("+", "-", "+"), times = c(12, 3, 3)),
    GeneName = rep(
        c(
            "LOC729737", "LOC100132287", "LOC100133331",
            "LOC100288069", "LINC01128", "LOC100130417",
            "C1orf159"
        ),
        times = c(1, 1, 2, 3, 2, 3, 6)
    ),
    GeneStrand = rep(
        c("-", "+", "-", "+", "-"),
        times = c(1, 1, 5, 2, 9)
    ),
    TimePoint = rep("01", times = 18),
    CellMarker = c(
        "CD34",
        "CD14",
        "CD14", "CD19",
        "CD13", "CD14", "CD34",
        "CD19", "CD3",
        "CD13", "CD19", "CD3",
        "CD19", "CD3", "CD34",
        "CD14", "CD19", "CD3"
    ),
    Tissue = c(
        "BM",
        "PB",
        "PB", "PB",
        "BM", "PB", "BM",
        "PB", "PB",
        "BM", "PB", "PB",
        "PB", "PB", "BM",
        "PB", "PB", "PB"
    ),
    Value = c(
        43.0477830391735,
        28.2167042889391,
        28.2167042889391, 77.2797527047913,
        98.0392156862745, 14.1083521444695, 12.914334911752,
        1.54559505409583, 6.99300699300699,
        0.980392156862745, 2.16383307573416, 6.29370629370629,
        4.63678516228748, 20.979020979021, 43.0477830391735,
        28.2167042889391, 13.9103554868624, 62.9370629370629
    )
)

#------------------------------------------------------------------------------#
# Tests
#------------------------------------------------------------------------------#
test_that("purity_filter produces expected output - sc", {
    # Filter directly on the raw `Value` column of the shared fixture.
    agg_key <- c("CellMarker", "Tissue", "TimePoint")
    grp_key <- c("CellMarker", "Tissue")
    observed <- purity_filter(
        x = df,
        aggregation_key = agg_key,
        group_key = grp_key,
        min_value = 0,
        impurity_threshold = 10,
        by_timepoint = TRUE,
        value_column = "Value"
    )
    expect_equal(observed, expected_output_sc)
})

test_that("purity_filter produces expected output - abundance", {
    # The same key is used both to compute abundance and to aggregate
    # inside the filter.
    agg_key <- c("CellMarker", "Tissue", "TimePoint")
    with_abundance <- compute_abundance(
        x = df,
        columns = "Value",
        key = agg_key
    )
    # Filter on the abundance column instead of the raw values
    # (presumably `Value_PercAbundance` is added by compute_abundance).
    observed <- purity_filter(
        x = with_abundance,
        aggregation_key = agg_key,
        group_key = c("CellMarker", "Tissue"),
        min_value = 0, impurity_threshold = 10,
        by_timepoint = TRUE, value_column = "Value_PercAbundance"
    )
    expect_equal(observed, expected_output_ab)
})

test_that("purity_filter produces expected output - join", {
    # Expected table, one row per line. `HematoLineage` is not a column of
    # `df`; presumably purity_filter obtains it by joining a lineage lookup
    # table on CellMarker (TODO confirm against purity_filter defaults).
    expected <- tibble::tribble(
        ~chr, ~integration_locus, ~strand, ~GeneName, ~GeneStrand,
        ~TimePoint, ~HematoLineage, ~Value,
        "1", 645551, "+", "LOC100133331", "-", "01", "Lymphoid", 501,
        "1", 645551, "+", "LOC100133331", "-", "01", "Myeloid", 1001,
        "1", 732938, "+", "LOC100288069", "-", "01", "CD34", 300,
        "1", 732938, "+", "LOC100288069", "-", "01", "Myeloid", 1500,
        "1", 775536, "+", "LINC01128", "+", "01", "CD34", 10,
        "1", 775536, "+", "LINC01128", "+", "01", "Lymphoid", 20,
        "1", 846681, "+", "LOC100130417", "-", "01", "CD34", 8,
        "1", 846681, "+", "LOC100130417", "-", "01", "Lymphoid", 23,
        "1", 846681, "+", "LOC100130417", "-", "01", "Myeloid", 22,
        "1", 1029785, "-", "C1orf159", "-", "01", "CD34", 1000,
        "1", 1036835, "+", "C1orf159", "-", "01", "Lymphoid", 180,
        "1", 1036835, "+", "C1orf159", "-", "01", "Myeloid", 1003,
        "1", 121249, "+", "LOC729737", "-", "01", "CD34", 1000,
        "1", 251227, "+", "LOC100132287", "+", "01", "Myeloid", 1001
    )
    agg_key <- c("CellMarker", "Tissue", "TimePoint")
    observed <- purity_filter(
        x = df,
        aggregation_key = agg_key,
        group_key = c("HematoLineage"),
        min_value = 3, impurity_threshold = 10,
        by_timepoint = TRUE, value_column = "Value"
    )
    expect_equal(observed, expected)
})

test_that("purity_filter produces expected output - group selection", {
    # Both ways of passing `selected_groups` (character vector, data frame)
    # are checked against this single expected table, one row per line.
    expected <- tibble::tribble(
        ~chr, ~integration_locus, ~strand, ~GeneName, ~GeneStrand,
        ~TimePoint, ~CellMarker, ~Tissue, ~Value,
        # BM rows (CD13 / CD34)
        "1", 121249, "+", "LOC729737", "-", "01", "CD34", "BM", 1000,
        "1", 251227, "+", "LOC100132287", "+", "01", "CD13", "BM", 1,
        "1", 251227, "+", "LOC100132287", "+", "01", "CD34", "BM", 1,
        "1", 645551, "+", "LOC100133331", "-", "01", "CD13", "BM", 1,
        "1", 645551, "+", "LOC100133331", "-", "01", "CD34", "BM", 1,
        "1", 732938, "+", "LOC100288069", "-", "01", "CD13", "BM", 1000,
        "1", 732938, "+", "LOC100288069", "-", "01", "CD34", "BM", 300,
        "1", 775536, "+", "LINC01128", "+", "01", "CD13", "BM", 1,
        "1", 775536, "+", "LINC01128", "+", "01", "CD34", "BM", 10,
        "1", 846681, "+", "LOC100130417", "-", "01", "CD13", "BM", 10,
        "1", 846681, "+", "LOC100130417", "-", "01", "CD34", "BM", 8,
        "1", 1029785, "-", "C1orf159", "-", "01", "CD34", "BM", 1000,
        "1", 1036835, "+", "C1orf159", "-", "01", "CD13", "BM", 3,
        "1", 1036835, "+", "C1orf159", "-", "01", "CD34", "BM", 3,
        # PB rows (CD14 / CD19 / CD3): every locus appears for each marker
        "1", 121249, "+", "LOC729737", "-", "01", "CD14", "PB", 1,
        "1", 121249, "+", "LOC729737", "-", "01", "CD19", "PB", 1,
        "1", 121249, "+", "LOC729737", "-", "01", "CD3", "PB", 1,
        "1", 251227, "+", "LOC100132287", "+", "01", "CD14", "PB", 1000,
        "1", 251227, "+", "LOC100132287", "+", "01", "CD19", "PB", 1,
        "1", 251227, "+", "LOC100132287", "+", "01", "CD3", "PB", 1,
        "1", 645551, "+", "LOC100133331", "-", "01", "CD14", "PB", 1000,
        "1", 645551, "+", "LOC100133331", "-", "01", "CD19", "PB", 500,
        "1", 645551, "+", "LOC100133331", "-", "01", "CD3", "PB", 1,
        "1", 732938, "+", "LOC100288069", "-", "01", "CD14", "PB", 500,
        "1", 732938, "+", "LOC100288069", "-", "01", "CD19", "PB", 1,
        "1", 732938, "+", "LOC100288069", "-", "01", "CD3", "PB", 1,
        "1", 775536, "+", "LINC01128", "+", "01", "CD14", "PB", 1,
        "1", 775536, "+", "LINC01128", "+", "01", "CD19", "PB", 10,
        "1", 775536, "+", "LINC01128", "+", "01", "CD3", "PB", 10,
        "1", 846681, "+", "LOC100130417", "-", "01", "CD14", "PB", 12,
        "1", 846681, "+", "LOC100130417", "-", "01", "CD19", "PB", 14,
        "1", 846681, "+", "LOC100130417", "-", "01", "CD3", "PB", 9,
        "1", 1029785, "-", "C1orf159", "-", "01", "CD14", "PB", 30,
        "1", 1029785, "-", "C1orf159", "-", "01", "CD19", "PB", 30,
        "1", 1029785, "-", "C1orf159", "-", "01", "CD3", "PB", 30,
        "1", 1036835, "+", "C1orf159", "-", "01", "CD14", "PB", 1000,
        "1", 1036835, "+", "C1orf159", "-", "01", "CD19", "PB", 90,
        "1", 1036835, "+", "C1orf159", "-", "01", "CD3", "PB", 90
    )
    agg_key <- c("CellMarker", "Tissue", "TimePoint")
    grp_key <- c("CellMarker", "Tissue")

    ## Vector
    from_vector <- purity_filter(
        x = df,
        aggregation_key = agg_key,
        group_key = grp_key,
        selected_groups = c("CD34", "CD13"),
        min_value = 0, impurity_threshold = 10,
        by_timepoint = TRUE, value_column = "Value"
    )
    expect_equal(from_vector, expected)

    ## DF
    groups_df <- tibble::tribble(
        ~CellMarker, ~Tissue,
        "CD34", "BM",
        "CD13", "BM"
    )
    from_df <- purity_filter(
        x = df,
        aggregation_key = agg_key,
        group_key = grp_key,
        selected_groups = groups_df,
        min_value = 0, impurity_threshold = 10,
        by_timepoint = TRUE, value_column = "Value"
    )
    expect_equal(from_df, expected)
})
# calabrialab/ISAnalytics documentation built on Nov. 2, 2023, 8:57 p.m.