Biostrings: Efficient manipulation of biological strings

## letterFrequency.R exports the following:
##    - alphabetFrequency
##    - hasOnlyBaseLetters
##    - uniqueLetters
##    - letterFrequency
##    - *letterFrequencyInSlidingView
##    - consensusMatrix
##    - consensusString (matrix, DNAStringSet, RNAStringSet)
##  - mkAllStrings
##  - oligonucleotideFrequency
##    - dinucleotideFrequency
##    - trinucleotideFrequency
##     - *oligonucleotideTransitions
##  - *twoWayAlphabetFrequency
##
## starred functions are still lacking tests

B_ALPHABET <- strsplit(rawToChar(as.raw(32:126)), "")[[1]]

check_uniqueLetters <- function(XStringFUN, X_ALPH) {
    set.seed(123L)
    s <- sample(X_ALPH, 100L, replace=TRUE)
    u_v <- unique(s)

    s <- paste(s, collapse="")
    expect_equal(sort(uniqueLetters(XStringFUN(s))), sort(u_v))
}

check_alphfreq <- function(XStringFUN, X_ALPH,
                           isMatrix=FALSE, checkBaseOnly=NULL)
{
    set.seed(456L)
    tab <- integer(length(X_ALPH))
    names(tab) <- X_ALPH

    if (isMatrix) {
        exp_res <- matrix(0L, nrow=5, ncol=length(tab))
        colnames(exp_res) <- names(tab)
        strs <- character(5L)
        for (i in seq_len(5)) {
            tab[] <- 0L
            s <- sample(X_ALPH, 100L, replace=TRUE)
            u_v <- table(s)
            tab[names(u_v)] <- u_v
            exp_res[i,] <- tab
            strs[i] <- paste(s, collapse="")
        }
        norm_res <- exp_res / rowSums(exp_res)
        ss <- XStringFUN(strs)
    } else {
        strs <- sample(X_ALPH, 100L, replace=TRUE)
        u_v <- table(strs)
        tab[names(u_v)] <- u_v
        exp_res <- tab
        ss <- XStringFUN(paste(strs, collapse=""))
        norm_res <- exp_res / sum(exp_res)
    }

    # Check alphabetFrequency
    v <- alphabetFrequency(ss)
    expect_equal(v, exp_res)
    expect_equal(sum(v), sum(nchar(strs)))
    expect_equal(alphabetFrequency(ss, as.prob=TRUE), norm_res)

    # Check letterFrequency
    expect_equal(letterFrequency(ss, X_ALPH), exp_res)
    expect_equal(letterFrequency(ss, X_ALPH, as.prob=TRUE), norm_res)

    if (!is.null(checkBaseOnly)) {
        # provide the alphabet
        b <- checkBaseOnly
        if (isMatrix) {
            p_in <- match(colnames(exp_res), b, nomatch=0)
            exp_res <- cbind(exp_res[ , p_in != 0],
                             other=rowSums(exp_res[,p_in==0]))
            norm_res <- exp_res / rowSums(exp_res)
        } else {
            p_in <- match(names(exp_res), b, nomatch=0)
            exp_res <- c(exp_res[p_in!=0], sum(exp_res[p_in==0]))
            names(exp_res) <- c(b, "other")
            norm_res <- exp_res / sum(exp_res)
        }

        v <- alphabetFrequency(ss, baseOnly=TRUE)
        expect_equal(v, exp_res)
        expect_equal(sum(v), sum(nchar(strs)))
        expect_error(alphabetFrequency(ss, baseOnly=NA),
                     "'baseOnly' must be TRUE or FALSE")
        expect_error(alphabetFrequency(ss, baseOnly=1),
                     "'baseOnly' must be TRUE or FALSE")
        expect_equal(alphabetFrequency(ss, baseOnly=TRUE, as.prob=TRUE),
                     norm_res)
        if (isMatrix) {
            expect_equal(letterFrequency(ss, b), exp_res[ , seq_along(b)])
            expect_equal(letterFrequency(ss, b, as.prob=TRUE),
                         norm_res[ , seq_along(b)])
        } else {
            expect_equal(letterFrequency(ss, b), exp_res[seq_along(b)])
            expect_equal(letterFrequency(ss, b, as.prob=TRUE),
                         norm_res[seq_along(b)])
        }
    }
}

test_that("uniqueLetters() works as expected", {
    check_uniqueLetters(DNAString, DNA_ALPHABET)
    check_uniqueLetters(RNAString, RNA_ALPHABET)
    check_uniqueLetters(AAString, AA_ALPHABET)
    check_uniqueLetters(BString, B_ALPHABET)

    check_uniqueLetters(DNAStringSet, DNA_ALPHABET)
    check_uniqueLetters(RNAStringSet, RNA_ALPHABET)
    check_uniqueLetters(AAStringSet, AA_ALPHABET)
    check_uniqueLetters(BStringSet, B_ALPHABET)
})

test_that("letterFrequency(), alphabetFrequency() work for all input types", {
    check_alphfreq(DNAString, DNA_ALPHABET, checkBaseOnly=DNA_BASES)
    check_alphfreq(RNAString, RNA_ALPHABET, checkBaseOnly=RNA_BASES)
    check_alphfreq(AAString, AA_ALPHABET, checkBaseOnly=AA_STANDARD)

    check_alphfreq(DNAStringSet, DNA_ALPHABET,
                   isMatrix=TRUE, checkBaseOnly=DNA_BASES)
    check_alphfreq(RNAStringSet, RNA_ALPHABET,
                   isMatrix=TRUE, checkBaseOnly=RNA_BASES)
    check_alphfreq(AAStringSet, AA_ALPHABET,
                   isMatrix=TRUE, checkBaseOnly=AA_STANDARD)

    ## checking for ambiguity tabulation
    d <- DNAString("ATGCATGC.+Y")
    exp_output <- c(2L, 4L, 2L)
    names(exp_output) <- c("A", "G|C", "T")
    expect_equal(letterFrequency(d, letters=c("A", "GC", "T")), exp_output)
    names(exp_output)[2] <- "G*C"
    expect_equal(letterFrequency(d, letters=c("A", "GC", "T"), OR="*"),
                 exp_output)

    r <- RNAString("AUGCAUGC.+Y")
    exp_output <- 8L
    names(exp_output) <- c("A|U|G|C")
    expect_equal(letterFrequency(r, letters="AUGC"), exp_output)
    exp_output <- rep(2L, 4)
    names(exp_output) <- c("A","U","G","C") # intentionally different
                                            # from RNA_BASES
    expect_equal(letterFrequency(r, letters="AUGC", OR=0), exp_output)

    ## BStrings have their own check
    set.seed(999L)
    bstr <- sample(B_ALPHABET, 100, replace=TRUE)
    bstr_counts <- table(match(bstr, B_ALPHABET))
    r <- integer(256L)
    r[as.integer(names(bstr_counts)) + 32L] <- bstr_counts
    bstr <- paste(bstr, collapse="")
    expect_equal(alphabetFrequency(BString(bstr)), r)
    expect_equal(alphabetFrequency(BStringSet(bstr)),
                 matrix(r, nrow=1L, ncol=256L))

    expect_error(alphabetFrequency(BString(), baseOnly=TRUE),
                 "unused argument (baseOnly = TRUE)", fixed=TRUE)

    ## checking for Views objects and maskedString objects
    ds <- sample(DNA_ALPHABET, 100L, replace=TRUE)
    exp_out <- matrix(0L, nrow=2, ncol=length(DNA_ALPHABET))
    colnames(exp_out) <- DNA_ALPHABET
    exp_out[1,names(table(ds[seq(1,50)]))] <- table(ds[seq(1,50)])
    exp_out[2,names(table(ds[seq(51,100)]))] <- table(ds[seq(51,100)])
    v <- Views(DNAString(paste(ds, collapse="")), start=c(1,51), end=c(50,100))
    expect_equal(alphabetFrequency(v), exp_out)
    expect_equal(letterFrequency(v, "A"), exp_out[,"A",drop=FALSE])

    mask_d <- DNAString(paste(ds, collapse=""))
    masks(mask_d) <- Mask(length(ds), start=c(5,50), end=c(10,60))
    exp_out <- rep(0L, length(DNA_ALPHABET))
    names(exp_out) <- DNA_ALPHABET
    subdstr <- ds[-c(5:10,50:60)]
    exp_out[names(table(subdstr))] <- table(subdstr)
    expect_equal(alphabetFrequency(mask_d), exp_out)
    expect_equal(letterFrequency(mask_d, "A"), exp_out["A"])
})

test_that("hasOnlyBaseLetters() works as expected", {
    ## XStringSet
    expect_true(hasOnlyBaseLetters(DNAStringSet(DNA_BASES)))
    expect_true(hasOnlyBaseLetters(RNAStringSet(RNA_BASES)))
    expect_true(hasOnlyBaseLetters(AAStringSet(AA_STANDARD)))

    expect_false(hasOnlyBaseLetters(DNAStringSet(DNA_ALPHABET)))
    expect_false(hasOnlyBaseLetters(RNAStringSet(RNA_ALPHABET)))
    expect_false(hasOnlyBaseLetters(AAStringSet(AA_ALPHABET)))
    expect_error(hasOnlyBaseLetters(BStringSet()),
                 "unable to find an inherited method")

    ## Views
    d1 <- DNAString(paste(DNA_BASES, collapse=""))
    d2 <- DNAString(paste(DNA_ALPHABET, collapse=""))
    v1 <- Views(d1, start=1, end=length(DNA_BASES))
    expect_true(hasOnlyBaseLetters(v1))
    v2 <- Views(d2, start=1, end=length(DNA_ALPHABET))
    expect_false(hasOnlyBaseLetters(v2))

    ## MaskedXString
    md1 <- d1
    md2 <- d2
    masks(md1) <- Mask(length(d1), start=2, end=3)
    masks(md2) <- Mask(length(d2), start=2, end=5)
    expect_true(hasOnlyBaseLetters(md1))
    expect_false(hasOnlyBaseLetters(md2))

})

test_that("miscellaneous letterFrequency() methods work", {
    ## mkAllStrings
    alph <- c("A", "B", "C")
    exp_output <- expand.grid(alph, alph, alph)
    exp_output <- apply(exp_output, 1L, paste, collapse="")
    expect_equal(sort(mkAllStrings(alph, 3L)), sort(exp_output))

    ## uniqueLetters
    d <- paste(rep(DNA_ALPHABET, each=3), collapse="")
    expect_equal(uniqueLetters(DNAString(d)), DNA_ALPHABET)
    expect_equal(uniqueLetters(DNAStringSet(c(d,d))), DNA_ALPHABET)
    v <- Views(DNAString(d), start=c(1,6), end=c(5,nchar(d)))
    expect_equal(uniqueLetters(v), DNA_ALPHABET)
    md <- DNAString(d)
    masks(md) <- Mask(nchar(d), start=seq(1,nchar(d),3), end=seq(2,nchar(d),3))
    expect_equal(uniqueLetters(md), DNA_ALPHABET)
})

test_that("nucleotideFrequency() methods work properly", {
    set.seed(999L)
    l <- 102L
    ds <- sample(DNA_BASES, l, replace=TRUE)
    d <- DNAString(paste(ds, collapse=""))
    dss <- DNAStringSet(c(ds,ds))

    ## oligonucleotideFrequency(x, 1L) should be the same as
    ## alphabetFrequency(..., baseOnly=TRUE) (ignoring the 'other' arg)
    expect_equal(oligonucleotideFrequency(d, 1L),
                 alphabetFrequency(d, baseOnly=TRUE)[seq_len(4L)])
    expect_equal(oligonucleotideFrequency(dss, 1L),
                 alphabetFrequency(dss, baseOnly=TRUE)[ , seq_len(4L)])

    ## dinucleotide correctness
    all_doubles <- table(paste0(ds[seq_len(l-1L)], ds[seq_len(l-1L)+1L]))
    exp_res <- rep(0L, length(DNA_BASES)**2)
    names(exp_res) <- mkAllStrings(DNA_BASES, 2L)
    exp_res[names(all_doubles)] <- all_doubles
    expect_equal(oligonucleotideFrequency(d, 2L), exp_res)
    expect_equal(dinucleotideFrequency(d), exp_res)

    ## trinucleotide correctness
    all_triples <- table(paste0(ds[seq_len(l-2L)],
                                ds[seq_len(l-2L)+1L],
                                ds[seq_len(l-2L)+2L]))
    exp_res <- rep(0L, length(DNA_BASES)**3)
    names(exp_res) <- mkAllStrings(DNA_BASES, 3L)
    exp_res[names(all_triples)] <- all_triples
    expect_equal(oligonucleotideFrequency(d, 3L), exp_res)
    expect_equal(trinucleotideFrequency(d), exp_res)

    ## using step
    all_codons <- codons(d)
    exp_res[] <- 0L
    codon_table <- table(as.character(all_codons))
    exp_res[names(codon_table)] <- codon_table
    expect_equal(oligonucleotideFrequency(d, 3L, step=3L), exp_res)

    ## frequency on a Views object
    exp_res <- matrix(0L, nrow=l/3, ncol=length(DNA_BASES)**3)
    colnames(exp_res) <- mkAllStrings(DNA_BASES, 3L)
    midx1 <- cbind(seq_len(nrow(exp_res)),
                   match(as.character(all_codons), colnames(exp_res)))
    exp_res[midx1] <- 1L
    expect_equal(oligonucleotideFrequency(codons(d), 3L), exp_res)

    ## frequency on a MaskedXString object
    mask_d <- d
    masks(mask_d) <- Mask(l, start=c(10,50), end=c(19,57))
    mask_str <- strsplit(as.character(mask_d), "")[[1]]
    all_doubles <- table(paste0(mask_str[seq_len(l-1L)],
                                mask_str[seq_len(l-1L) + 1L]))
    all_doubles <- all_doubles[!grepl("#", names(all_doubles))]
    exp_res <- rep(0L, length(DNA_BASES)**2)
    names(exp_res) <- mkAllStrings(DNA_BASES, 2L)
    exp_res[names(all_doubles)] <- all_doubles
    expect_equal(oligonucleotideFrequency(mask_d, 2L), exp_res)

    ## disallowed types
    expect_error(oligonucleotideFrequency(AAString(),
                 "must contain sequences of type DNA or RNA"))
    expect_error(oligonucleotideFrequency(BStringSet(),
                 "must contain sequences of type DNA or RNA"))
})

test_that("consensusMatrix/String() work properly on zero-length input", {
    ## verify previous functionality
    exp_output <- matrix(0L, nrow=5, ncol=4)
    diag(exp_output) <- 1L
    rownames(exp_output) <- c(DNA_BASES, "other")
    expect_identical(consensusMatrix(DNAStringSet("ACGT"), baseOnly=TRUE),
                     exp_output)
    expect_identical(consensusString(DNAStringSet(c("ATGC", "ATTC"))), "ATKC")

    ## DNA_ALPHABET is a subset of AA_ALPHABET
    all_letters <- do.call(union, list(AA_ALPHABET, RNA_ALPHABET))
    exp_zero_output <- matrix(ncol=0, nrow=length(all_letters))
    rownames(exp_zero_output) <- all_letters

    ## having issues testing for matrix equality manually
    expect_equal_matrix <- function(m1, m2) {
        expect_identical(nrow(m1), nrow(m2))
        expect_identical(rownames(m1), rownames(m2))
    }
    expect_equal_matrix(consensusMatrix(DNAStringSet()),
                        exp_zero_output[DNA_ALPHABET,])
    expect_equal_matrix(consensusMatrix(RNAStringSet()),
                        exp_zero_output[RNA_ALPHABET,])
    expect_equal_matrix(consensusMatrix( AAStringSet()),
                        exp_zero_output[AA_ALPHABET,])
    expect_equal_matrix(consensusMatrix(  BStringSet()),
                        exp_zero_output[NULL,])

    ## zero length input for consensusString
    expect_identical(consensusString(DNAStringSet()), character(0L))
    expect_identical(consensusString(RNAStringSet()), character(0L))
    expect_identical(consensusString(AAStringSet()), character(0L))
    expect_identical(consensusString(BStringSet()), character(0L))
    expect_identical(consensusString(character(0L)), character(0L))

    expect_true(is.integer(consensusMatrix(DNAStringSet(), as.prob=FALSE)))
    expect_false(is.integer(consensusMatrix(DNAStringSet(), as.prob=TRUE)))
})
Bioconductor/Biostrings documentation built on June 10, 2025, 1:14 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Bioconductor/Biostrings
Efficient manipulation of biological strings

tests/testthat/test-letterFrequency.R
In Bioconductor/Biostrings: Efficient manipulation of biological strings

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/Biostrings Efficient manipulation of biological strings

tests/testthat/test-letterFrequency.R In Bioconductor/Biostrings: Efficient manipulation of biological strings

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/Biostrings
Efficient manipulation of biological strings

tests/testthat/test-letterFrequency.R
In Bioconductor/Biostrings: Efficient manipulation of biological strings