Biostrings: Efficient manipulation of biological strings

## testing for both replaceAt.R and replaceLetterAt.R
##
## these export the following:
## - extractAt
## - replaceAt
## - replaceLetterAt
## - (.inplaceReplaceLetterAt)
## These functions work like subseq(<-) but without copying data
## replaceLetterAt will make a copy of the sequence.
## .inplaceReplaceLetterAt will not, but its use is discouraged.
## note that replaceLetterAt is only defined for character, DNAString, DNAStringSet

test_that("replaceLetterAt has expected behavior", {
    str <- "AGAGAGAGAGAG"
    d <- DNAString(str)
    dss <- DNAStringSet(list(d,d))
    pos_replace <- c(1, 3, 5, 7, 8)
    mat_replace <- matrix(rep(FALSE, length(d)*2), nrow=2L, byrow=TRUE)
    mat_replace[,pos_replace] <- TRUE
    r_str <- "CGCGCGCCAGAG"
    replace_dss <- DNAStringSet(rep(strrep("C", length(pos_replace)), 2))
    #expect_equal(replaceLetterAt(str, pos_replace, "C"), replaced_str)
    object <- replaceLetterAt(d, pos_replace, rep("C", length(pos_replace)))
    expect_equal(object, DNAString(r_str))
    object <- replaceLetterAt(d, mat_replace[1, ],
                              rep("C", length(pos_replace)))
    expect_equal(object, DNAString(r_str))
    object <- replaceLetterAt(dss, mat_replace, replace_dss)
    expect_equal(object, DNAStringSet(c(r_str, r_str)))

    d2 <- DNAString("RMWSYK")
    expect_equal(replaceLetterAt(d2, 1:6, rep(c("A","M"), each=3)),
                 DNAString("AAAMMM"))
    object <- replaceLetterAt(d2, 1:6, rep(c("A","M"), each=3),
                              if.not.extending="merge")
    expect_equal(object, DNAString("RMWVHN"))
    object <- replaceLetterAt(d2, 1:6, rep(c("A","M"), each=3),
                              if.not.extending="skip")
    expect_equal(object, d2)
    expect_error(replaceLetterAt(d2, 1:6, rep(c("A","M"), each=3),
                                 if.not.extending="error"),
                 "does not extend old letter")
})

## old tests

### From an XStringSet object
### =========================

### Only compare the classes, the shapes (i.e. lengths + elementNROWS +
### names), the inner names, and the sequence contents. Doesn't look at
### the metadata or the metadata columns (outer or inner).
identical_XStringSetList <- function(target, current) {
    ok1 <- identical(class(target), class(current))
    ok2 <- identical(elementNROWS(target), elementNROWS(current))
    unlisted_target <- unlist(target, use.names=FALSE)
    unlisted_current <- unlist(current, use.names=FALSE)
    ok3 <- identical(names(unlisted_target), names(unlisted_current))
    ok4 <- all(unlisted_target == unlisted_current)
    ok1 && ok2 && ok3 && ok4
}

### Only compare the classes, lengths, names, and sequence contents.
### Doesn't look at the metadata or the metadata columns.
identical_XStringSet <- function(target, current) {
    ok1 <- identical(class(target), class(current))
    ok2 <- identical(names(target), names(current))
    ok3 <- all(target == current)
    ok1 && ok2 && ok3
}

#randomDisjointRanges <- function(min_start, max_end, min_width, max_width, n) {
#    offset <- min_start - 1L
#    n1 <- n * (1.1 + max_width/(max_end-offset+1L))  # very approximative
#    some_starts <- sample(max_end-offset+1L, n1, replace=TRUE) + offset
#    some_widths <- sample(min_width:max_width, n1, replace=TRUE)
#    some_ranges <- IRanges(some_starts, width=some_widths)
#    some_ranges <- some_ranges[end(some_ranges) <= max_end]
#    ans <- disjoin(some_ranges)
#    if (min_width == 0L) {
#        is_empty <- width(some_ranges) == 0L
#        some_empty_ranges <- some_ranges[is_empty]
#        ans <- sample(c(ans, some_empty_ranges))
#    }
#    if (length(ans) < n)
#        stop("internal error, sorry")
#    head(ans, n=n)
#}


test_that("old extractAt tests still work", {
    x <- BStringSet(c(seq1="ABCD", seq2="abcdefghijk", seq3="XYZ"))
    at <- IRangesList(IRanges(c(2, 1), c(3, 0)),
                      IRanges(c(7, 2, 12, 7), c(6, 5, 11, 8)),
                      IRanges(2, 2))
    ### Set inner names on 'at'.
    unlisted_at <- unlist(at)
    names(unlisted_at) <- paste0("rg", sprintf("%02d", seq_along(unlisted_at)))
    at <- relist(unlisted_at, at)

    res1a <- extractAt(x, at)
    res1b <- BStringSetList(mapply(extractAt, x, at))
    expect_true(identical_XStringSetList(res1a, res1b))

    res2a <- extractAt(x, at[3])
    res2b <- BStringSetList(mapply(extractAt, x, at[3]))
    expect_true(identical_XStringSetList(res2a, res2b))
    res2c <- extractAt(x, at[[3]])
    expect_true(identical_XStringSetList(res2a, res2c))

    ## performance testing is cut, this is too time consuming
    ### Testing performance with 2 millions small extractions at random
    ### locations in Fly upstream sequences:
    # dm3_upstream_filepath <- system.file("extdata",
    #                                      "dm3_upstream2000.fa.gz",
    #                                      package="Biostrings")
    # dm3_upstream <- readDNAStringSet(dm3_upstream_filepath)
    # dm3_upstream <- dm3_upstream[width(dm3_upstream) == 2000L]

    # set.seed(33)
    # ## cut this down significantly from 2,000,000
    # ## performance is not important for unit testing
    # NE <- 2000000L  # total number of extractions

    # sample_size <- NE * 1.1
    # some_ranges <- IRanges(sample(2001L, sample_size, replace=TRUE),
    #                        width=sample(0:75, sample_size, replace=TRUE))
    # some_ranges <- head(some_ranges[end(some_ranges) <= 2000L], n=NE)
    # f <- factor(sample(length(dm3_upstream), NE, replace=TRUE),
    #             levels=seq_along(dm3_upstream))
    # at <- unname(split(some_ranges, f))

    # ### Takes < 1s on my machine.
    # res3a <- extractAt(dm3_upstream, at)

    ### Equivalent but about 250x slower than the above on my machine.
    #system.time(res3b <- DNAStringSetList(mapply(extractAt, dm3_upstream, at)))
    #expect_true(identical_XStringSetList(res3a, res3b))
})

test_that("old replaceAt tests still work", {
    ### On an XString object
    ### ====================

    ### Testing performance with half-million small substitutions at random
    ### locations in Celegans chrI:
    # library(BSgenome.Celegans.UCSC.ce2)
    # genome <- BSgenome.Celegans.UCSC.ce2
    # chrI <- genome$chrI
    # at4 <- randomDisjointRanges(1L, nchar(chrI), 0L, 20L, 500000L)
    # ### Takes < 1s on my machine (64-bit Ubuntu).
    # system.time(current <- replaceAt(chrI, at4, Views(chrI, at4)))
    # stopifnot(current == chrI)

    # ### Testing performance with half-million single-letter insertions
    # ### at random locations in Celegans chrI:
    # at5 <- randomDisjointRanges(1L, nchar(chrI), 0L, 0L, 500000L)
    # ### Takes < 1s on my machine (64-bit Ubuntu).
    # system.time(current <- replaceAt(chrI, at5, value="-"))
    # m <- matchPattern("-", current)
    # stopifnot(identical(sort(start(at5)), start(m) - seq_along(at5) + 1L))

    # system.time(current2 <- replaceAt(chrI, start(at5), value="-"))
    # stopifnot(identical(current, current2))

    # matchPattern("---", current)

    ### On an XStringSet object
    ### =======================

    x <- BStringSet(c(seq1="ABCD", seq2="abcdefghijk", seq3="XYZ"))
    at <- IRangesList(IRanges(c(2, 1), c(3, 0)),
                      IRanges(c(7, 2, 12, 7), c(6, 5, 11, 8)),
                      IRanges(2, 2))
    ### Set inner names on 'at'.
    unlisted_at <- unlist(at)
    names(unlisted_at) <- paste0("rg", sprintf("%02d", seq_along(unlisted_at)))
    at <- relist(unlisted_at, at)

    current <- replaceAt(x, at, value=extractAt(x, at))  # no-op
    expect_true(identical_XStringSet(x, current))

    res1a <- replaceAt(x, at)  # deletions
    res1b <- mendoapply(replaceAt, x, at)
    expect_true(identical_XStringSet(res1a, res1b))

    ## cutting this as well
    ### Testing performance with half-million single-letter insertions at random
    ### locations in Fly upstream sequences:
    ##
    # set.seed(33)
    # f <- factor(sample(length(dm3_upstream), 500000L, replace=TRUE),
    #             levels=seq_along(dm3_upstream))
    # at5 <- unname(split(sample(2001L, 500000L, replace=TRUE), f))
    # ### Takes < 1s on my machine.
    # system.time(res5a <- replaceAt(dm3_upstream, at5, value="-"))
    # ### Equivalent but about 1400x slower than the above on my machine.
    # system.time(res5b <- mendoapply(replaceAt,
    #                          dm3_upstream, as(at5, "List"), as("-", "List")))
    # stopifnot(identical_XStringSet(res5a, res5b))
})

Bioconductor/Biostrings documentation built on Nov. 11, 2024, 12:58 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Bioconductor/Biostrings
Efficient manipulation of biological strings

tests/testthat/test-replaceAt.R
In Bioconductor/Biostrings: Efficient manipulation of biological strings

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/Biostrings Efficient manipulation of biological strings

tests/testthat/test-replaceAt.R In Bioconductor/Biostrings: Efficient manipulation of biological strings

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/Biostrings
Efficient manipulation of biological strings

tests/testthat/test-replaceAt.R
In Bioconductor/Biostrings: Efficient manipulation of biological strings