## Brendan Furneaux
## May 2021
methods::setClass(
"SpoaAlgo",
slots = c(
algorithm = "character",
m = "integer",
n = "integer",
g = "integer",
e = "integer",
q = "integer",
c = "integer"
),
prototype = list(
algorithm = NA_character_,
m = NA_integer_,
n = NA_integer_,
g = NA_integer_,
e = NA_integer_,
q = NA_integer_,
c = NA_integer_
)
)
#' Specify an algorithm for SPOA
#' @param m (non-negative `integer`) score for a match. *Default*: `5L`
#' @param n (non-positive `integer`) score for a mismatch. *Default*: `-4L`
#' @param g (non-positive `integer`) gap opening penalty. *Default*: `-8L`
#' @param e (non-positive `integer`) gap extension penalty. *Default*: same
#' value as `g`
#' @param q (non-positive `integer`) second gap opening penalty.
#' *Default*: same value as `g`
#' @param c (non-positive `integer`) second gap extension penalty.
#' *Default*: same value as `e`
#' @param algorithm (`character` string) alignment mode; one of `"local"`
#' (Smith-Watterman), `"global"` (Needleman-Wunsch), or `"semi.global"`
#' (Overlap). *Default*: `"local"`
#'
#' @return an object of class "`SpoaAlgo`", suitable for passing to
#' [spoaAlign()] or [spoaConsensus()]
#' @rdname spoaAlign
SpoaAlgo <- function(algorithm = c("local", "global", "semi.global"),
m = 5L, n = -4L, g = -8L, e = g, q = g, c = q) {
algorithm <- match.arg(algorithm)
assertthat::assert_that(
isNonnegativeScalarInt(m),
isNonpositiveScalarInt(n),
isNonpositiveScalarInt(g),
isNonpositiveScalarInt(e),
isNonpositiveScalarInt(q),
isNonpositiveScalarInt(c)
)
methods::new("SpoaAlgo",
algorithm = algorithm,
m = as.integer(m),
n = as.integer(n),
g = as.integer(g),
e = as.integer(e),
q = as.integer(q),
c = as.integer(c)
)
}
#' Align sequences using SPOA (and optionally get the consensus)
#'
#' @param seq (`character` vector, [`Biostrings::XStringSet-class`],
#' [`ShortRead::ShortRead-class`], or [`dada2::derep-class`]) sequences to
#' align.
#' @param algo (`SpoaAlgo` object) parameters for SPOA, generated by
#' [SpoaAlgo()].
#' @param w (`integer` vector) weights for the sequences. *Default*: `1L`
#' @param ... additional parameters passed to methods
#'
#' @return For `spoaConsensus()`, either a `character` string or the
#' appropriate [`Biostrings::XString-class`], depending on the class of `seq`.
#'
#' For `spoaAlign()`, either a
#' `character` vector or a [`Biostrings::MultipleAlignment-class`], depending on
#' the class of `seq`. If `seq` is a `BStringSet` (i.e., an `XStringSet` which
#' is not specifically DNA, RNA, or AA) then the result is also a `BStringSet`,
#' since there is no corresponding "`BMultipleAlignment`" class. If `seq` is a
#' [`ShortRead::ShortRead-class`] object, the output is a
#' [`Biostrings::DNAMultipleAlignment-class`].
#'
#' | **seq**| **spoaConsensus** | **spoaAlign** |
#' |---------|---------------|-----------|
#' |`character(n)`|`character(1)` | `character(n)`|
#' |[`BStringSet`][Biostrings::XStringSet-class]|[`BString`][Biostrings::XString-class]|[`BStringSet`][Biostrings::XStringSet-class]|
#' |[`DNAStringSet`][Biostrings::XStringSet-class]|[`DNAString`][Biostrings::XString-class]|[`DNAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`RNAStringSet`][Biostrings::XStringSet-class]|[`RNAString`][Biostrings::XString-class]|[`RNAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`AAStringSet`][Biostrings::XStringSet-class]|[`AAString`][Biostrings::XString-class]|[`AAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`QualityScaledBStringSet`][Biostrings::QualityScaledXStringSet-class]|[`BString`][Biostrings::XString-class]|[`BStringSet`][Biostrings::XStringSet-class]|
#' |[`QualityScaledDNAStringSet`][Biostrings::QualityScaledXStringSet-class]|[`DNAString`][Biostrings::XString-class]|[`DNAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`QualityScaledRNAStringSet`][Biostrings::QualityScaledXStringSet-class]|[`RNAString`][Biostrings::XString-class]|[`RNAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`QualityScaledAAStringSet`][Biostrings::QualityScaledXStringSet-class]|[`AAString`][Biostrings::XString-class]|[`AAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`ShortReadQ`][ShortRead::ShortReadQ-class]|[`DNAString`][Biostrings::XString-class]|[`DNAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`derep`][dada2::derep-class]|`character(1)`| `character(n)`|
#'
#' @details
#'
#' ## Gap penalties
#'
#' The general (convex) gap penalty formula is:
#'
#' `min(g + (i - i) * e, q + (i - 1) * c)`
#'
#' For `q == g` and `c == e` (as is the case unless `q` or `c` is explicitly
#' set), this simplifies to the affine gap penalty:
#'
#' `g + (i - 1) * e`
#'
#' If, additionally, `e == g` (the default), then this is the linear gap
#' penalty:
#'
#' `g * i`
#'
#' ## Weighting
#'
#' Assigning a weight *w* to a sequence is equivalent to including that sequence
#' *w* times; this is primarily useful for dereplicated sequences, where all
#' sequences are unique, and the weight represents their multiplicity.
#'
#' Note that POA is not order-independent, so results may differ when a set of
#' non-unique sequences is dereplicated. For the most predictable results, it
#' is recommended to sort dereplicated sequences in decreasing order of
#' abundance (as in [dada2::derepFastq()]).
#'
#' ## Quality scores
#'
#' If `seq` is an object which includes quality scores
#' ([`Biostrings::QualityScaledXStringSet-class`], [`ShortRead::ShortReadQ`],
#' or [`dada2::derep-class`])
#' then the alignment consensus is weighted using the quality scores. The method
#' used by SPOA uses the integer quality scores as per-position weights
#' analogous to sequence weights; i.e., if two different bases align in the same
#' position, one in a single sequence with quality score 40 and the other in a
#' single sequence with quality score 20, then this is equivalent to the first
#' base occurring at that position in 40 sequences without quality scores, and
#' the second base occurring at that position in 20 sequences without quality
#' scores.
#'
#' It is possible to use a combination of sequence weights and quality scores.
#' In this case, in order for a dereplicated sequence set to give the same
#' results as the non-dereplicated sequences, the quality score for each of the
#' dereplicated sequences should be the mean of the quality scores for the
#' corresponding non-dereplicated sequences. This is the method used to generate
#' quality scores for dereplicated sequences in [dada2::derepFastq()].
#'
#' @examples
#' sequences <- c(
#' "CATAAAAGAACGTAGGTCGCCCGTCCGTAACCTGTCGGATCACCGGAAAGGACCCGTAAAGTGATAATGAT",
#' "ATAAAGGCAGTCGCTCTGTAAGCTGTCGATTCACCGGAAAGATGGCGTTACCACGTAAAGTGATAATGATTAT",
#' "ATCAAAGAACGTGTAGCCTGTCCGTAATCTAGCGCATTTCACACGAGACCCGCGTAATGGG",
#' "CGTAAATAGGTAATGATTATCATTACATATCACAACTAGGGCCGTATTAATCATGATATCATCA",
#' "GTCGCTAGAGGCATCGTGAGTCGCTTCCGTACCGCAAGGATGACGAGTCACTTAAAGTGATAAT",
#' "CCGTAACCTTCATCGGATCACCGGAAAGGACCCGTAAATAGACCTGATTATCATCTACAT"
#' )
#' spoaAlign(sequences)
#' spoaConsensus(sequences)
#' @export
spoaAlign <- function(seq, algo = SpoaAlgo(), ...) {
UseMethod("spoaAlign")
}
#' @param qual (`list` of `numeric` vectors) per-base quality scores for the
#' sequences. *Default*: none
#' @export
#' @rdname spoaAlign
spoaAlign.character <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
assertthat::assert_that(methods::is(algo, "SpoaAlgo"))
checkWeights(seq, w)
if (length(w) == 1L) w <- rep(w, length(seq))
msa <- rep(NA_character_, length(seq))
names(msa) <- names(seq)
no_na <- !is.na(seq)
msa[no_na] <- if (!is.null(qual)) {
checkQuals(seq, qual)
if (all(vapply(qual, is.integer, TRUE))) {
spoa_align_intqual(
seq[no_na],
qual[no_na],
algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
w[no_na]
)
} else {
spoa_align_dblqual(
seq[no_na],
qual[no_na],
algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
w[no_na]
)
}
} else {
spoa_align(
seq[no_na],
algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
w[no_na]
)
}
msa
}
#' @export
#' @rdname spoaAlign
spoaAlign.XStringSet <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
s <- spoaAlign.character(as.character(seq), algo, w, qual, ...)
matchXMultipleAlignment(s, seq)
}
#' @export
spoaAlign.ShortRead <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
requireShortRead()
seq2 <- ShortRead::sread(seq)
names(seq2) <- as.character(ShortRead::id(seq))
spoaAlign.XStringSet(seq2, algo, w, qual = NULL, ...)
}
#' @export
#' @rdname spoaAlign
spoaAlign.QualityScaledXStringSet <- function(seq, algo = SpoaAlgo(), w = 1, ...) {
requireBiostrings()
s <- spoaAlign.character(as.character(seq), algo, w,
qual = as.list(methods::as(Biostrings::quality(seq), "IntegerList"))
)
matchXMultipleAlignment(s, seq)
}
#' @export
#' @rdname spoaAlign
spoaAlign.ShortReadQ <- function(seq, algo = SpoaAlgo(), w = 1, ...) {
requireShortRead()
requireBiostrings()
s <- methods::as(seq, "QualityScaledDNAStringSet")
names(s) <- as.character(ShortRead::id(seq))
spoaAlign.QualityScaledXStringSet(s, algo, w, ...)
}
#' @export
#' @rdname spoaAlign
spoaAlign.derep <- function(seq, algo = SpoaAlgo(), ...) {
spoaAlign.character(
names(seq$uniques),
algo,
w = seq$uniques,
qual = apply(seq$quals, 1, stats::na.omit)
)
}
#' @rdname spoaAlign
#' @export
spoaConsensus <- function(seq, algo = SpoaAlgo(), ...) {
UseMethod("spoaConsensus")
}
#' @export
#' @rdname spoaAlign
spoaConsensus.character <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
assertthat::assert_that(methods::is(algo, "SpoaAlgo"))
checkWeights(seq, w)
if (length(w) == 1L) w <- rep(w, length(seq))
no_na <- !is.na(seq)
if (!is.null(qual)) {
checkQuals(seq, qual)
if (all(vapply(qual, is.integer, TRUE))) {
spoa_consensus_intqual(
seq[no_na],
qual[no_na],
algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
w[no_na]
)
} else {
spoa_consensus_dblqual(
seq[no_na],
qual[no_na],
algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
w[no_na]
)
}
} else {
spoa_consensus(
seq[no_na],
algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
w[no_na]
)
}
}
#' @export
#' @rdname spoaAlign
spoaConsensus.XStringSet <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
s <- spoaConsensus.character(as.character(seq), algo, w, qual, ...)
matchXString(s, seq)
}
#' @export
#' @rdname spoaAlign
spoaConsensus.ShortRead <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
requireShortRead()
seq2 <- ShortRead::sread(seq)
names(seq2) <- as.character(ShortRead::id(seq))
spoaConsensus.XStringSet(seq2, algo, w, qual, ...)
}
#' @export
#' @rdname spoaAlign
spoaConsensus.QualityScaledXStringSet <- function(seq, algo = SpoaAlgo(), w = 1, ...) {
requireBiostrings()
if (length(w) == 1L) w <- rep(w, length(seq))
s <- spoaConsensus.character(
as.character(seq),
algo = algo,
w = w,
qual = as.list(methods::as(Biostrings::quality(seq), "IntegerList")),
...
)
matchXString(s, seq)
}
#' @export
#' @rdname spoaAlign
spoaConsensus.ShortReadQ <- function(seq, algo = SpoaAlgo(), w = 1, ...) {
requireShortRead()
requireBiostrings()
spoaConsensus.QualityScaledXStringSet(
methods::as(seq, "QualityScaledDNAStringSet"),
algo, w, ...
)
}
#' @export
#' @rdname spoaAlign
spoaConsensus.derep <- function(seq, algo = SpoaAlgo(), ...) {
spoaConsensus.character(
names(seq$uniques),
algo,
w = seq$uniques,
qual = apply(seq$quals, 1, stats::na.omit),
...
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.