Biostrings: Efficient manipulation of biological strings

# --- to be included in an upcoming biocDatasets package
randomDNASequences <- function(n, w)
{
  alphabet <- DNA_BASES
  w <- rep(w, length=n)
  sequences <- sapply(seq(1, n, length=n),
                      function(x) {
                        s <- sample(alphabet, w[x], replace=TRUE)
                        s <- paste(s, collapse="")
                        return(s)
                      })
  return(Biostrings::DNAStringSet(sequences))
}

msubseq <- function(x, ir)
{
  ## differs from subseq in the sense that several subsequences
  ## from the same sequence are extracted
  ## x:  XString
  ## ir: IRanges
  res <- vector("character", length = length(ir))
  for (i in seq(along=res)) {
    res[i] <- as.character(subseq(x, start=ir@start[i], width=width(ir)[i]))
    ## forced cast: chain of tools for DNAString seems interupted for
    ##              some use cases (or I missed something)
  }
  res <- DNAStringSet(res)
  return(res)
}


# ---

test_pdictConstantWidth <- function()
{
  set.seed(1)
  l <- 150
  target <- randomDNASequences(1, l)[[1]]
  W <- 20
  L <- 6 
  ir <- successiveIRanges(rep(W, L), gapwidth = 1)
  short_sequences <- msubseq(target, ir)
  # shuffle the sequences (they are not in consecutive order)
  o <- sample(seq(along=short_sequences))
  
  dna_short <- DNAStringSet(short_sequences[o])
  pdict <- PDict(dna_short)
  checkEquals(L, length(pdict))
  checkEquals(rep(W, L), width(pdict))
  checkEquals(NULL, head(pdict))
  checkEquals(W, tb.width(pdict))
  checkEquals(NULL, tail(pdict))
}

test_pdictVariableWidth <- function()
{
  set.seed(1)
  l <- 150
  target <- randomDNASequences(1, l)[[1]]
  W <- 20
  L <- 6
  n_cut <- sample(0:5, L, replace=TRUE)
  ir <- successiveIRanges(rep(W, L) - n_cut, gapwidth = 1 + n_cut[-length(n_cut)])
  short_sequences <- msubseq(target, ir)
  # shuffle the sequences (they are not in consecutive order)
  o <- sample(seq(along=short_sequences))
  
  dna_var_short <- DNAStringSet(short_sequences[o])
  
  pdict <- PDict(dna_var_short,
                 tb.start=1,                        # can't this be
                 tb.width=min(width(short_sequences)) # the default for
                                                    # variable width ?
                 )
  checkEquals(L, length(pdict))
  checkEquals( (rep(W, L) - n_cut)[o], width(pdict))
  checkEquals(NULL, head(pdict))
  shortest_seq_width <- min(width(dna_var_short))
  checkEquals(shortest_seq_width,
              tb.width(pdict))           # mostly a sanity check
  checkEquals(substring(short_sequences, shortest_seq_width+1)[o],
              as.character(tail(pdict)))
}


test_matchConstantWidth <- function()
{
  set.seed(1)
  l <- 150
  dna_target <- randomDNASequences(1, l)[[1]]
  W <- 20
  L <- 6 
  ir <- successiveIRanges(rep(W, L), gapwidth = 1)
  short_sequences <- msubseq(dna_target, ir)
  # shuffle the sequences (so they are not in consecutive order)
  o <- sample(seq(along=short_sequences))
  
  dna_short <- DNAStringSet(short_sequences[o])
  pdict <- PDict(dna_short)
  
  res <- matchPDict(pdict, dna_target)

  # mostly a sanity check
  checkEquals(L, length(res))
  
  for (i in seq(along=res)) {
    m_start <- ir[o][i]@start
    checkEquals(m_start, start(res[[i]]))
    checkEquals(W, width(res[[i]]))
    checkEquals(m_start + W - 1, end(res[[i]]))  # mostly a sanity check
  }
  
  
}

test_matchVariableWidth <- function()
{
  set.seed(1)
  l <- 150
  dna_target <- randomDNASequences(1, l)[[1]]
  W <- 20
  L <- 6
  n_cut <- sample(0:5, L, replace=TRUE)
  ir <- successiveIRanges(rep(W, L) - n_cut, gapwidth = 1 + n_cut[-length(n_cut)])
  short_sequences <- msubseq(dna_target, ir)
  # shuffle the sequences (they are not in consecutive order)
  o <- sample(seq(along=short_sequences))
  
  dna_var_short <- DNAStringSet(short_sequences[o])
  
  pdict <- PDict(dna_var_short,
                 tb.start=1,                        # can't this be
                 tb.width=min(width(dna_var_short)) # the default for
                                                    # variable width ?
                 )

  res <- matchPDict(pdict, dna_target)

  # mostly a sanity check
  checkEquals(L, length(res))
  
  iro <- ir[o]
  for (i in seq(along=res)) {
    checkEquals(start(iro)[i], start(res[[i]]))
    checkEquals(width(iro)[i], width(res[[i]]))
    checkEquals(end(iro)[i], end(res[[i]]))  # mostly a sanity check
  }
    
}

Bioconductor/Biostrings documentation built on June 7, 2024, 10:18 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Bioconductor/Biostrings
Efficient manipulation of biological strings

inst/unitTests/test_matchPDict.R
In Bioconductor/Biostrings: Efficient manipulation of biological strings

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/Biostrings Efficient manipulation of biological strings

inst/unitTests/test_matchPDict.R In Bioconductor/Biostrings: Efficient manipulation of biological strings

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/Biostrings
Efficient manipulation of biological strings

inst/unitTests/test_matchPDict.R
In Bioconductor/Biostrings: Efficient manipulation of biological strings