# --- to be included in an upcoming biocDatasets package
randomDNASequences <- function(n, w)
{
alphabet <- DNA_BASES
w <- rep(w, length=n)
sequences <- sapply(seq(1, n, length=n),
function(x) {
s <- sample(alphabet, w[x], replace=TRUE)
s <- paste(s, collapse="")
return(s)
})
return(Biostrings::DNAStringSet(sequences))
}
msubseq <- function(x, ir)
{
## differs from subseq in the sense that several subsequences
## from the same sequence are extracted
## x: XString
## ir: IRanges
res <- vector("character", length = length(ir))
for (i in seq(along=res)) {
res[i] <- as.character(subseq(x, start=ir@start[i], width=width(ir)[i]))
## forced cast: chain of tools for DNAString seems interupted for
## some use cases (or I missed something)
}
res <- DNAStringSet(res)
return(res)
}
# ---
test_pdictConstantWidth <- function()
{
set.seed(1)
l <- 150
target <- randomDNASequences(1, l)[[1]]
W <- 20
L <- 6
ir <- successiveIRanges(rep(W, L), gapwidth = 1)
short_sequences <- msubseq(target, ir)
# shuffle the sequences (they are not in consecutive order)
o <- sample(seq(along=short_sequences))
dna_short <- DNAStringSet(short_sequences[o])
pdict <- PDict(dna_short)
checkEquals(L, length(pdict))
checkEquals(rep(W, L), width(pdict))
checkEquals(NULL, head(pdict))
checkEquals(W, tb.width(pdict))
checkEquals(NULL, tail(pdict))
}
test_pdictVariableWidth <- function()
{
set.seed(1)
l <- 150
target <- randomDNASequences(1, l)[[1]]
W <- 20
L <- 6
n_cut <- sample(0:5, L, replace=TRUE)
ir <- successiveIRanges(rep(W, L) - n_cut, gapwidth = 1 + n_cut[-length(n_cut)])
short_sequences <- msubseq(target, ir)
# shuffle the sequences (they are not in consecutive order)
o <- sample(seq(along=short_sequences))
dna_var_short <- DNAStringSet(short_sequences[o])
pdict <- PDict(dna_var_short,
tb.start=1, # can't this be
tb.width=min(width(short_sequences)) # the default for
# variable width ?
)
checkEquals(L, length(pdict))
checkEquals( (rep(W, L) - n_cut)[o], width(pdict))
checkEquals(NULL, head(pdict))
shortest_seq_width <- min(width(dna_var_short))
checkEquals(shortest_seq_width,
tb.width(pdict)) # mostly a sanity check
checkEquals(substring(short_sequences, shortest_seq_width+1)[o],
as.character(tail(pdict)))
}
test_matchConstantWidth <- function()
{
set.seed(1)
l <- 150
dna_target <- randomDNASequences(1, l)[[1]]
W <- 20
L <- 6
ir <- successiveIRanges(rep(W, L), gapwidth = 1)
short_sequences <- msubseq(dna_target, ir)
# shuffle the sequences (so they are not in consecutive order)
o <- sample(seq(along=short_sequences))
dna_short <- DNAStringSet(short_sequences[o])
pdict <- PDict(dna_short)
res <- matchPDict(pdict, dna_target)
# mostly a sanity check
checkEquals(L, length(res))
for (i in seq(along=res)) {
m_start <- ir[o][i]@start
checkEquals(m_start, start(res[[i]]))
checkEquals(W, width(res[[i]]))
checkEquals(m_start + W - 1, end(res[[i]])) # mostly a sanity check
}
}
test_matchVariableWidth <- function()
{
set.seed(1)
l <- 150
dna_target <- randomDNASequences(1, l)[[1]]
W <- 20
L <- 6
n_cut <- sample(0:5, L, replace=TRUE)
ir <- successiveIRanges(rep(W, L) - n_cut, gapwidth = 1 + n_cut[-length(n_cut)])
short_sequences <- msubseq(dna_target, ir)
# shuffle the sequences (they are not in consecutive order)
o <- sample(seq(along=short_sequences))
dna_var_short <- DNAStringSet(short_sequences[o])
pdict <- PDict(dna_var_short,
tb.start=1, # can't this be
tb.width=min(width(dna_var_short)) # the default for
# variable width ?
)
res <- matchPDict(pdict, dna_target)
# mostly a sanity check
checkEquals(L, length(res))
iro <- ir[o]
for (i in seq(along=res)) {
checkEquals(start(iro)[i], start(res[[i]]))
checkEquals(width(iro)[i], width(res[[i]]))
checkEquals(end(iro)[i], end(res[[i]])) # mostly a sanity check
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.