monaLisa: Binned Motif Enrichment Analysis and Visualization

test_that(".binomEnrichmentTests works", {
    suppressMessages(expect_message(
        res <- .binomEnrichmentTest(
            matchCountBg = c(5, 8), totalWeightBg = 17,
            matchCountFg = c(9, 3), totalWeightFg = 20, verbose = TRUE)
    ))
    expect_equal(res[1],
                 log(binom.test(x = 9, n = 20, p = 5/17,
                                alternative = "greater")$p.value))
    expect_equal(res[2],
                 log(binom.test(x = 3, n = 20, p = 8/17,
                                alternative = "greater")$p.value))
})

test_that(".fisherEnrichmentTest works", {
    suppressMessages(expect_message(
        res <- .fisherEnrichmentTest(
            matchCountBg = c(5, 8), totalWeightBg = 17,
            matchCountFg = c(9, 3), totalWeightFg = 20, verbose = TRUE)
    ))
    expect_equal(res[1],
                 log(fisher.test(x = matrix(c(9, 20 - 9, 5, 17 - 5), nrow = 2),
                                 alternative = "greater")$p.value))
    expect_equal(res[2],
                 log(fisher.test(x = matrix(c(3, 20 - 3, 8, 17 - 8), nrow = 2),
                                 alternative = "greater")$p.value))
})

test_that(".calcPearsonResiduals works", {
    res <- .calcPearsonResiduals(
        matchCountBg = c(5, 8), totalWeightBg = 17,
        matchCountFg = c(9, 3), totalWeightFg = 20)
    expect_equal(res[1],
                 (9 - (9 + 5)/(20 + 17) * 20)/sqrt((9 + 5)/(20 + 17) * 20 *
                                                       (1 - 20/(20 + 17)) *
                                                       (1 - (5 + 9)/(20 + 17))))
    expect_equal(res[2],
                 (3 - (3 + 8)/(20 + 17) * 20)/sqrt((3 + 8)/(20 + 17) * 20 *
                                                       (1 - 20/(20 + 17)) *
                                                       (1 - (8 + 3)/(20 + 17))))

})

test_that(".calcExpFg works", {
    res <- .calcExpFg(
        matchCountBg = c(5, 8), totalWeightBg = 17,
        matchCountFg = c(9, 3), totalWeightFg = 20)
    expect_equal(res[1], (9 + 5)/(20 + 17) * 20)
    expect_equal(res[2], (3 + 8)/(20 + 17) * 20)
})

test_that(".calcLog2Enr works", {
    res <- .calcLog2Enr(
        matchCountBg = c(5, 8), totalWeightBg = 17,
        matchCountFg = c(9, 3), totalWeightFg = 20, pseudocount = 3)
    expect_equal(res[1], log2((9/20 * 17 + 3)/(5/17 * 17 + 3)))
    expect_equal(res[2], log2((3/20 * 17 + 3)/(8/17 * 17 + 3)))
})

test_that(".checkDfValidity() works", {
    seqchar <- c(s1 = "GTGCATGCAT", s2 = "ACGTACGTAC")
    df <- DataFrame(seqs = DNAStringSet(seqchar),
                    isForeground = c(TRUE, FALSE),
                    GCfrac = NA_real_,
                    GCbin = NA_integer_,
                    GCwgt = NA_real_,
                    seqWgt = NA_real_)

    expect_error(.checkDfValidity("error"), "should be a .*DataFrame")
    expect_error(.checkDfValidity(df[1:3]), "has to have columns")
    expect_error(.checkDfValidity(DataFrame(seqs = seqchar, df[2:6])), "expected types")
    expect_error(.checkDfValidity(DataFrame(df[1], DataFrame(isForeground = 1:2), df[3:6])), "expected types")
    expect_error(.checkDfValidity(DataFrame(df[1:2], DataFrame(GCfrac = seqchar), df[4:6])), "expected types")
    expect_error(.checkDfValidity(DataFrame(df[1:3], DataFrame(GCbin = seqchar), df[5:6])), "expected types")
    expect_error(.checkDfValidity(DataFrame(df[1:4], DataFrame(GCwgt = seqchar), df[6])), "expected types")
    expect_error(.checkDfValidity(DataFrame(df[1:5], DataFrame(seqWgt = seqchar))), "expected types")
    expect_error(.checkDfValidity(df), "attribute.*: err")
})


test_that(".filterSeqs() works", {
    seqs <- DNAStringSet(c(s1 = "GTGCATGCATACCA", s2 = "ACGNNNGTAC", s3 = "AAA"))

    expect_error(.filterSeqs("error"), "DNAStringSet")
    expect_error(.filterSeqs(seqs, maxFracN = "error"), "numeric")
    expect_error(.filterSeqs(seqs, minLength = -1), "between 0 and Inf")
    expect_error(.filterSeqs(seqs, maxLength = 0), "between 5 and Inf")
    expect_error(.filterSeqs(seqs, verbose = "error"), "logical")
    suppressMessages(
        expect_message(.filterSeqs(seqs, maxFracN = 0.1, verbose = TRUE))
    )

    expect_identical(.filterSeqs(seqs), c(TRUE, TRUE, FALSE))
    expect_identical(.filterSeqs(seqs, maxFracN = 0.1, verbose = FALSE), c(TRUE, FALSE, FALSE))
    expect_identical(.filterSeqs(seqs, minLength = 12, verbose = FALSE), c(TRUE, FALSE, FALSE))
    expect_identical(.filterSeqs(seqs, maxLength = 10, verbose = FALSE), c(FALSE, TRUE, FALSE))
})


test_that(".defineBackground() works", {
    set.seed(1)
    seqs <- DNAStringSet(unlist(lapply(1:90,
                                       function(i) paste(sample(c("A","C","G","T"),
                                                                20, replace = TRUE,
                                                                prob = c(.2,.3,.3,.2)),
                                                         collapse = ""))))
    names(seqs) <- paste0("s", seq_along(seqs))
    b1 <- factor(rep(1:3, each = 30))
    b1 <- setZeroBin(b1, 2)
    b2 <- factor(rep(1:2, each = 45))
    b2 <- setZeroBin(b2, 2)
    gnm <- DNAStringSet(unlist(lapply(1:10,
                                      function(i) paste(sample(c("A","C","G","T"),
                                                               1000 - i * 10,
                                                               replace = TRUE),
                                                        collapse = ""))))
    names(gnm) <- paste0("g", seq_along(gnm))
    gnm2 <- DNAStringSet(unlist(lapply(1:10,
                                       function(i) paste(sample(c("A","C","G","T"),
                                                                1000 - i * 10,
                                                                replace = TRUE,
                                                                prob = c(.4, .1, .1, .4)),
                                                         collapse = ""))))
    names(gnm2) <- paste0("g", seq_along(gnm2))

    df1 <- .defineBackground(seqs, b1, "otherBins", 1, NULL, NULL, 2, 0.7)
    df2 <- .defineBackground(seqs, b1, "allBins",   1, NULL, NULL, 2, 0.7)
    df3 <- .defineBackground(seqs, b1, "zeroBin",   1, NULL, NULL, 2, 0.7)
    set.seed(123L)
    df4 <- .defineBackground(seqs, b1, "genome",    1, gnm,  NULL, 2, 0.7)

    df5 <- .defineBackground(seqs, b2, "otherBins", 1, NULL, NULL, 2, 0.7)
    df6 <- .defineBackground(seqs, b2, "zeroBin",   1, NULL, NULL, 2, 0.7)

    set.seed(42L)
    df7 <- .defineBackground(seqs, b1, "genome",    1, gnm,
                             GenomicRanges::GRanges("g3", IRanges::IRanges(1, 970)),
                             2, 0.7)
    expect_warning(df8 <- .defineBackground(seqs, b1, "genome", 1,
                                            gnm2, NULL, 2, 0.7))

    expect_s4_class(df1, "DataFrame")
    expect_s4_class(df2, "DataFrame")
    expect_s4_class(df3, "DataFrame")
    expect_s4_class(df4, "DataFrame")
    expect_s4_class(df5, "DataFrame")
    expect_s4_class(df6, "DataFrame")
    expect_s4_class(df7, "DataFrame")
    expect_s4_class(df8, "DataFrame")

    expect_identical(attr(df1, "err"), NA)

    expect_identical(dim(df1), c( 90L, 6L))
    expect_identical(dim(df2), c(120L, 6L))
    expect_identical(dim(df3), c( 60L, 6L))
    expect_identical(dim(df4), c( 60L, 6L))
    expect_identical(dim(df5), c( 90L, 6L))
    expect_identical(dim(df6), c( 90L, 6L))
    expect_identical(dim(df7), c( 60L, 6L))
    expect_identical(dim(df8), c( 60L, 6L))

    expect_identical(df5, df6)

    expect_identical(unlist(lapply(as.character(df7$seqs[!df7$isForeground]),
                                   function(gs) {
                                       grep(gs, as.character(gnm))
                                   }), use.names = FALSE), rep(3L, 30))

    expect_true(.checkDfValidity(df1))
    expect_true(.checkDfValidity(df2))
    expect_true(.checkDfValidity(df3))
    expect_true(.checkDfValidity(df4))
    expect_true(.checkDfValidity(df5))
    expect_true(.checkDfValidity(df6))
    expect_true(.checkDfValidity(df7))

    expect_identical(sum(df1$isForeground), 30L)
    expect_identical(sum(df2$isForeground), 30L)
    expect_identical(sum(df3$isForeground), 30L)
    expect_identical(sum(df4$isForeground), 30L)
    expect_identical(sum(df5$isForeground), 45L)
    expect_identical(sum(df6$isForeground), 45L)
    expect_identical(sum(df7$isForeground), 30L)

    expect_identical(rownames(df1), names(seqs))
    expect_identical(rownames(df2), names(seqs)[c(1:30, 1:90)])
    expect_identical(rownames(df3), names(seqs)[c(1:60)])
    expect_identical(rownames(df4)[1:30], names(seqs)[1:30])
    expect_true(all(grepl("^g[0-9]+$", rownames(df4)[31:60])))
    expect_identical(rownames(df5), names(seqs))
    expect_identical(rownames(df6), names(seqs))
    expect_identical(rownames(df7)[1:30], names(seqs)[1:30])
    expect_true(all(grepl("^g[0-9]+$", rownames(df7)[31:60])))

    # for background = "genome", does the sampling make the G+C distribution more similar?
    gnm.tiles <- BSgenome::getSeq(gnm,
                                  unlist(GenomicRanges::tileGenome(GenomeInfoDb::seqlengths(gnm),
                                                                   tilewidth = 20)))
    set.seed(42L)
    df4b <- .defineBackground(gnm.tiles, factor(rep(1, length(gnm.tiles))), "otherBins", 1,
                              NULL, NULL, NULL, 0.7)
    GCbreaks = c(0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8)
    gnm.gcbin.tab <- unclass(tabulate(findInterval(df4b$GCfrac, GCbreaks, all.inside = TRUE), 9))
    bg.gcbin.tab  <- unclass(tabulate(findInterval(df4$GCfrac[!df4$isForeground], GCbreaks, all.inside = TRUE), 9))
    fg.gcbin.tab  <- unclass(tabulate(findInterval(df4$GCfrac[ df4$isForeground], GCbreaks, all.inside = TRUE), 9))
    expect_true(cor(fg.gcbin.tab, bg.gcbin.tab) > cor(fg.gcbin.tab, gnm.gcbin.tab))
})


test_that(".calculateGCweight() works", {
    fseq <- c(f1  = "AGGGGGGGGG", f2  = "AAGGGGGGGG", f3  = "AAAGGGGGGG",
              f4  = "AAAAGGGGGG",                     f6  = "AAAAAAGGGG",
              f7  = "AAAAAAAGGG", f8  = "AAAAAAAAGG", f9  = "AAAAAAAAAG")
    bseq <- c(b1  = "AGGGGGGGGG", b2  = "AAGGGGGGGG", b3  = "AAAGGGGGGG",
              b4  = "AAAAGGGGGG", b5  = "AAAAAGGGGG", b6  = "AAAAAAGGGG",
              b7  = "AAAAAAAGGG", b8  = "AAAAAAAAGG", b9  = "AAAAAAAAAG",
              b2b = "AAGGGGGGGG", b4b = "AAAAGGGGGG", b6b = "AAAAAAGGGG")
    df <- DataFrame(seqs = DNAStringSet(c(fseq, bseq)),
                    isForeground = rep(c(TRUE, FALSE), c(length(fseq), length(bseq))),
                    GCfrac = NA_real_,
                    GCbin = NA_integer_,
                    GCwgt = NA_real_,
                    seqWgt = NA_real_)
    attr(df, "err") <- 0

    expect_error(.calculateGCweight("error"), "should be a .*DataFrame")
    expect_error(.calculateGCweight(df, GCbreaks = "error"), "numeric")
    expect_error(.calculateGCweight(df, GCbreaks = 0.2), "length 2 or greater")
    expect_error(.calculateGCweight(df, verbose = "error"), "logical")
    suppressMessages(expect_message(.calculateGCweight(df, verbose = TRUE)))

    expect_s4_class(res1 <- .calculateGCweight(df, verbose = FALSE), "DataFrame")
    expect_identical(df[-13, 1:2], res1[, 1:2])
    expect_identical(res1$GCfrac, c(9:6,4:1,9:6,4:1,8,6,4) / 10)
    expect_identical(res1[res1$isForeground, "GCwgt"],
                     rep(1.0, sum(res1$isForeground)))
    expect_identical(res1[!res1$isForeground, "GCwgt"],
                     rep(c(1.03125, 0.68750, 1.37500, 1.03125, 0.68750), c(3, 2, 3, 1, 2)))
})


test_that(".normForKmers() works", {
    set.seed(123)
    len <- 50
    seqschar <- unlist(lapply(seq.int(150), function(i) {
        paste(sample(c("A","C","G","T","N"), len, replace = TRUE, prob = c(.24, .24, .24, .24, .04)), collapse = "")
    }))
    seqs <- DNAStringSet(seqschar)
    kmerfreq <- lapply(1:3, function(k) oligonucleotideFrequency(seqs, width = k))
    gkmers <- lapply(kmerfreq, rowSums)
    kmerfreq <- lapply(1:3, function(k) kmerfreq[[k]] / gkmers[[k]])
    kmerseqrc <- lapply(kmerfreq, function(x) as.character(reverseComplement(DNAStringSet(colnames(x)))))
    seqwgt <- c(rep(1, 75), rnorm(75, mean = 1, sd = 0.1))
    isfg <- rep(c(TRUE, FALSE), each = 75)

    res1 <- .normForKmers(kmerfreq, gkmers, kmerseqrc, seqwgt, isfg)

    expect_type(res1, "list")
    expect_length(res1, 2L)
    expect_identical(names(res1), c("seqWgt", "err"))
    expect_identical(round(res1$seqWgt[!isfg], 3),
                     c(0.856, 1.085, 1.06, 0.965, 1.056, 1.011, 1.046, 0.831, 1.204,
                       0.937, 0.787, 1.101, 0.839, 1.017, 0.882, 0.844, 0.964, 1.04,
                       0.979, 0.923, 0.967, 1.028, 0.955, 0.92, 1.081, 0.87, 1.064,
                       0.935, 0.955, 1.141, 0.94, 0.88, 0.873, 1.041, 0.812, 1.023,
                       0.934, 0.843, 0.986, 1.107, 0.806, 0.876, 1.095, 0.895, 0.866,
                       0.862, 0.95, 0.987, 0.932, 1.042, 1.049, 1.027, 1.078, 0.98,
                       0.986, 1.065, 0.994, 1.079, 0.985, 1.023, 1.039, 0.882, 1.029,
                       0.972, 1.178, 0.931, 1.077, 0.767, 0.923, 0.955, 1.113, 0.949,
                       0.972, 1.118, 0.989))
})


test_that(".iterativeNormForKmers() works", {
    fseq <- c(f1  = "AGGGGGGGGG", f2  = "AAGGGGGGGG", f3  = "AAAGGGGGGG",
              f4  = "AAAAGGGGGG",                     f6  = "AAAAAAGGGG",
              f7  = "AAAAAAAGGG", f8  = "AAAAAAAAGG", f9  = "AAAAAAAAAG")
    bseq <- c(b1  = "AGGGGGGGGG", b2  = "AAGGGGGGGG", b3  = "AAAGGGGGGG",
              b4  = "AAAAGGGGGG", b5  = "AAAAAGGGGG", b6  = "AAAAAAGGGG",
              b7  = "AAAAAAAGGG", b8  = "AAAAAAAAGG", b9  = "AAAAAAAAAG",
              b2b = "AAGGGGGGGG", b4b = "AAAAGGGGGG", b6b = "AAAAAAGGGG")
    df <- DataFrame(seqs = DNAStringSet(c(fseq, bseq)),
                    isForeground = rep(c(TRUE, FALSE), c(length(fseq), length(bseq))),
                    GCfrac = NA_real_,
                    GCbin = NA_integer_,
                    GCwgt = NA_real_,
                    seqWgt = NA_real_)
    attr(df, "err") <- 0
    df <- .calculateGCweight(df)

    expect_error(.iterativeNormForKmers("error"))
    expect_error(.iterativeNormForKmers(df, maxKmerSize = "error"), "integer")
    expect_error(.iterativeNormForKmers(df, minSeqWgt = -1), "between 0 and Inf")
    expect_error(.iterativeNormForKmers(df, maxIter = "error"), "integer")
    expect_error(.iterativeNormForKmers(df, verbose = "error"), "logical")

    suppressMessages(
        expect_message(res0 <- .iterativeNormForKmers(df, maxKmerSize = 2L, verbose = TRUE))
    )
    expect_s4_class(res0, "DataFrame")
    expect_identical(round(attr(res0, "err"), 6), 0.495758)
    expect_identical(dim(df), dim(res0))
    suppressMessages(
        expect_message(res1 <- .iterativeNormForKmers(df, verbose = TRUE))
    )
    expect_s4_class(res1, "DataFrame")
    expect_identical(round(attr(res1, "err"), 6), 0.954395)
    attr(df, "err") <- attr(res1, "err")
    expect_identical(dim(res1), dim(df))
    expect_identical(res1[1:5], df[1:5])
    expect_identical(round(res1[!res1$isForeground, "seqWgt"], 3),
                     c(1.251, 0.887, 0.913, 0.675, 0.7, 1.344, 1.347,
                       1.347, 0.887, 0.675, 0.7))
})


test_that(".checkIfSeqsAreEqualLength works as expected", {
    x1 <- Biostrings::DNAStringSet(c("AAA", "CCC", "GGGGGG"))
    x2 <- GenomicRanges::GRanges("chr1",
                                 IRanges::IRanges(start = c(1, 10, 20),
                                                  width = c(3,  3,  6)))

    expect_warning(.checkIfSeqsAreEqualLength(x = x1),
                   "Not all elements of x1 have the same length")
    expect_warning(.checkIfSeqsAreEqualLength(x = x2),
                   "Not all elements of x2 have the same length")

    expect_true(is.null(.checkIfSeqsAreEqualLength(x = x1[1:2])))
    expect_true(is.null(.checkIfSeqsAreEqualLength(x = x2[1:2])))
})
fmicompbio/monaLisa documentation built on June 15, 2025, 1:50 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
fmicompbio/monaLisa
Binned Motif Enrichment Analysis and Visualization

tests/testthat/test_utils_enrichment.R
In fmicompbio/monaLisa: Binned Motif Enrichment Analysis and Visualization

R Package Documentation

Browse R Packages

We want your feedback!

fmicompbio/monaLisa Binned Motif Enrichment Analysis and Visualization

tests/testthat/test_utils_enrichment.R In fmicompbio/monaLisa: Binned Motif Enrichment Analysis and Visualization

R Package Documentation

Browse R Packages

We want your feedback!

fmicompbio/monaLisa
Binned Motif Enrichment Analysis and Visualization

tests/testthat/test_utils_enrichment.R
In fmicompbio/monaLisa: Binned Motif Enrichment Analysis and Visualization