Biostrings: Efficient manipulation of biological strings

## matchPDict.R exports the following:
##        - matchPDict
##        - countPDict
##        - whichPDict
##        - vmatchPDict
##        - vcountPDict
##        - vwhichPDict
## These are all S4s defined for "subject", XString,
##    XStringSet, XStringViews, MaskedXString

test_that("general match/count/whichPDict() behavior is as expected", {
    d1 <- DNAString("ATGGATACACAA")
    all_codons <- mkAllStrings(DNA_BASES, 3L)

    ## non-pdict with matchPDict
    p1 <- matchPDict(DNAStringSet("ATA"), d1)
    expect_equal(as.integer(c(start(p1), end(p1))), c(5,7))

    pd <- PDict(all_codons)

    ## pd against a DNAStringSet
    p2 <- lengths(matchPDict(pd, d1))
    v1 <- Views(d1, start=seq_len(length(d1)-2L), width=3)
    pos_good <- which(all_codons %in% as.character(v1))
    expect_true(all(p2[pos_good] > 0) && all(p2[-pos_good] == 0))
    expect_equal(whichPDict(pd, d1), pos_good)
    expect_equal(countPDict(pd, d1), p2)

    ## pd against an XStringViews object
    p3 <- lengths(matchPDict(pd, codons(d1)))
    pos_good <- which(all_codons %in% as.character(codons(d1)))
    expect_true(all(p3[pos_good] > 0) && all(p3[-pos_good] == 0))
    expect_equal(whichPDict(pd, codons(d1)), pos_good)
    expect_equal(countPDict(pd, codons(d1)), p3)

    ## pd against a MaskedXString object
    m1 <- Mask(length(d1), start=4, end=8)
    d2 <- d1
    masks(d2) <- m1
    p4 <- lengths(matchPDict(pd, d2))
    v2 <- Views(unmasked(d2), start=c(1,9,10), width=3)
    pos_good <- which(all_codons %in% as.character(v2))
    expect_true(all(p4[pos_good] > 0) && all(p4[-pos_good] == 0))
    expect_equal(whichPDict(pd, d2), pos_good)
    expect_equal(countPDict(pd, d2), p4)

    ## matchPDict() with AA, RNA, B
    aa1 <- AAStringSet(c("DARC", "EGH"))
    aa2 <- AAString("KMFPRNDEGHSTTWTEE")
    paa <- matchPDict(aa1, aa2)
    expect_equal(c(unlist(startIndex(paa)), unlist(endIndex(paa))), c(8,10))
    expect_equal(countPDict(aa1, aa2), c(0,1))
    expect_equal(whichPDict(aa1, aa2), 2)

    b1 <- BStringSet(c("DARC", "EGH"))
    b2 <- BString("KMFPRNDEGHSTTWTEE")
    pb <- matchPDict(b1, b2)
    expect_equal(c(unlist(startIndex(pb)), unlist(endIndex(pb))), c(8,10))
    expect_equal(countPDict(b1, b2), c(0,1))
    expect_equal(whichPDict(b1, b2), 2)
})

test_that("inexact PDict matching works", {
    pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2)
    d <- DNAString("acggaccg")
    expect_equal(lengths(endIndex(matchPDict(pdict, d, max.mismatch=0))),
                 c(0L,0L,0L,2L))
    expect_equal(lengths(endIndex(matchPDict(pdict, d, max.mismatch=1))),
                 c(1L,0L,2L,2L))
    expect_equal(lengths(endIndex(matchPDict(pdict, d, max.mismatch=2))),
                 c(2L,0L,2L,2L))

    expect_equal(countPDict(pdict, d, max.mismatch=0), c(0,0,0,2))
    expect_equal(countPDict(pdict, d, max.mismatch=1), c(1,0,2,2))
    expect_equal(countPDict(pdict, d, max.mismatch=2), c(2,0,2,2))
})

test_that("vcount/vwhich/vmatchPDict() work", {
    ## canary tests for future changes
    pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2)
    d <- DNAString("acggaccg")
    dss <- DNAStringSet(list(d,d))
    dvv <- Views(d, start=rep(1,2), width=rep(length(d), 2))

    # Disallowed cases
    expect_error(matchPDict(pdict, dss), "please use vmatchPDict")
    expect_error(vmatchPDict(pdict, d), "please use matchPDict")
    expect_error(vmatchPDict(pdict, as(d, "MaskedDNAString")),
                 "please use matchPDict")
    expect_error(vwhichPDict(pdict, d), "please use whichPDict")
    expect_error(vwhichPDict(pdict, as(d, "MaskedDNAString")),
                 "please use whichPDict")
    expect_error(vcountPDict(pdict, d), "please use countPDict")
    expect_error(vcountPDict(pdict, as(d, "MaskedDNAString")),
                 "please use countPDict")

    exp_out <- matrix(rep(c(0,2), times=c(6,2)), nrow=4L, byrow=TRUE)
    expect_equal(vcountPDict(pdict, dss), exp_out)
    expect_equal(vwhichPDict(pdict, dss), list(4,4))
    expect_equal(vcountPDict(pdict, dvv), exp_out)
    expect_equal(vwhichPDict(pdict, dvv), list(4,4))

    exp_out[1,] <- 1
    exp_out[3,] <- 2
    expect_equal(vcountPDict(pdict, dss, max.mismatch=1), exp_out)
    expect_equal(vwhichPDict(pdict, dss, max.mismatch=1),
                 list(c(1,3,4),c(1,3,4)))
    expect_equal(vcountPDict(pdict, dvv, max.mismatch=1), exp_out)
    expect_equal(vwhichPDict(pdict, dvv, max.mismatch=1),
                 list(c(1,3,4),c(1,3,4)))

    exp_out[1,] <- 2
    expect_equal(vcountPDict(pdict, dss, max.mismatch=2), exp_out)
    expect_equal(vwhichPDict(pdict, dss, max.mismatch=2),
                 list(c(1,3,4),c(1,3,4)))
    expect_equal(vcountPDict(pdict, dvv, max.mismatch=2), exp_out)
    expect_equal(vwhichPDict(pdict, dvv, max.mismatch=2),
                 list(c(1,3,4),c(1,3,4)))

    ## this will fail when we implement it to remind me to add tests for it
    expect_error(vmatchPDict(pdict, dss),
                 "vmatchPDict() is not ready yet", fixed=TRUE)
})

test_that("MIndex functionality works", {
    ## set up an MIndex object
    s <- mkAllStrings(c("A", "T"), 2L)
    names(s) <- s
    pd <- PDict(s)
    d <- DNAString("ATTATTAATTAA")
    m <- matchPDict(pd, d)
    exp_match_starts <- list(c(7,11), c(1,4,8), c(3,6,10), c(2,5,9))
    expect_equal(length(m), 2^2)
    expect_equal(names(m), names(s))
    expect_equal(startIndex(m), exp_match_starts)
    expect_equal(endIndex(m), lapply(exp_match_starts, \(x) x+1L))
    expect_equal(elementNROWS(m), c(2,3,3,3))
    expect_equal(m[[1L]], IRanges(start=c(7,11), end=c(8,12)))
    expect_s4_class(as(m, "CompressedIRangesList"), "CompressedIRangesList")
    expect_equal(start(unlist(m)), unlist(exp_match_starts))
    ux <- extractAllMatches(d, m)
    exp_ux <- rep(s, times=c(2,3,3,3))
    expect_equal(start(ux), unlist(exp_match_starts))
    expect_equal(as.character(ux), exp_ux)
})

### All old tests
## Helper functions ported from old tests
randomDNASequences <- function(n, w)
{
    alphabet <- DNA_BASES
    w <- rep(w, length=n)
    sequences <- sapply(seq(1, n, length=n),
                        function(x) {
                            s <- sample(alphabet, w[x], replace=TRUE)
                            paste(s, collapse="")
                        })
    Biostrings::DNAStringSet(sequences)
}

msubseq <- function(x, ir)
{
    ## differs from subseq in the sense that several subsequences
    ## from the same sequence are extracted
    ## x:  XString
    ## ir: IRanges
    res <- vector("character", length = length(ir))
    for (i in seq_along(res)) {
        res[i] <- as.character(subseq(x, start=ir@start[i], width=width(ir)[i]))
        ## forced cast: chain of tools for DNAString seems interupted for
        ##              some use cases (or I missed something)
    }
    DNAStringSet(res)
}

test_that("matchPDict() works for constant width dictionary", {
    set.seed(1)
    l <- 150
    dna_target <- randomDNASequences(1, l)[[1]]
    W <- 20
    L <- 6
    ir <- successiveIRanges(rep(W, L), gapwidth = 1)
    short_sequences <- msubseq(dna_target, ir)
    # shuffle the sequences (so they are not in consecutive order)
    o <- sample(seq_along(short_sequences))

    dna_short <- DNAStringSet(short_sequences[o])
    pdict <- PDict(dna_short)

    res <- matchPDict(pdict, dna_target)

    # mostly a sanity check
    expect_equal(L, length(res))

    for (i in seq_along(res)) {
        m_start <- ir[o][i]@start
        expect_equal(m_start, start(res[[i]]))
        expect_equal(W, width(res[[i]]))
        expect_equal(m_start + W - 1, end(res[[i]]))  # mostly a sanity check
    }
})

test_that("matchPDict() works for variable width dictionary", {
    set.seed(1)
    l <- 150
    dna_target <- randomDNASequences(1, l)[[1]]
    W <- 20
    L <- 6
    n_cut <- sample(0:5, L, replace=TRUE)
    ir <- successiveIRanges(rep(W, L) - n_cut,
                            gapwidth = 1 + n_cut[-length(n_cut)])
    short_sequences <- msubseq(dna_target, ir)
    # shuffle the sequences (they are not in consecutive order)
    o <- sample(seq_along(short_sequences))

    dna_var_short <- DNAStringSet(short_sequences[o])

    pdict <- PDict(dna_var_short,
                   tb.start=1,                          # can't these be
                   tb.width=min(width(dna_var_short)))  # the default for
                                                        # variable width ?

    res <- matchPDict(pdict, dna_target)

    # mostly a sanity check
    expect_equal(L, length(res))

    iro <- ir[o]
    for (i in seq_along(res)) {
        expect_equal(start(iro)[i], start(res[[i]]))
        expect_equal(width(iro)[i], width(res[[i]]))
        expect_equal(end(iro)[i], end(res[[i]]))  # mostly a sanity check
    }
})

Bioconductor/Biostrings documentation built on Nov. 11, 2024, 12:58 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Bioconductor/Biostrings
Efficient manipulation of biological strings

tests/testthat/test-matchPDict.R
In Bioconductor/Biostrings: Efficient manipulation of biological strings

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/Biostrings Efficient manipulation of biological strings

tests/testthat/test-matchPDict.R In Bioconductor/Biostrings: Efficient manipulation of biological strings

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/Biostrings
Efficient manipulation of biological strings

tests/testthat/test-matchPDict.R
In Bioconductor/Biostrings: Efficient manipulation of biological strings