#' @import utils
utils::globalVariables(c("."))
#' @importFrom rlang .data
#' @importFrom futile.logger flog.namespace flog.trace flog.debug flog.info
#' @importFrom stringr str_extract str_replace str_c
#' @importFrom assertthat assert_that
#' @importFrom tibble tibble
#' @importFrom magrittr %>%
`%>%`
.onLoad <- function(libname, pkgname) { #nolint
backports::import(pkgname)
}
# Combine futile.logger and tictoc by putting the output of toc in a log message
flog_toc <- function(
level = c("INFO", "TRACE", "DEBUG", "WARN", "ERROR",
"FATAL", "CARP"),
func.toc = tictoc::toc.outmsg, #nolint
...
) {
level <- match.arg(level)
ffunc <-
switch(level,
TRACE = futile.logger::flog.trace,
DEBUG = futile.logger::flog.debug,
INFO = futile.logger::flog.info,
WARN = futile.logger::flog.warn,
ERROR = futile.logger::flog.error,
FATAL = futile.logger::flog.fatal,
CARP = futile.logger::flog.carp
)
toc <- tictoc::toc(quiet = TRUE)
ffunc(func.toc(toc$tic, toc$toc, toc$msg), ...)
}
#' Combine DADA2 \code{\link[dada2:derep-class]{derep}} objects into master map
#'
#' @param dereps a (possibly named) \code{list} of
#' (\code{\link[dada2:derep-class]{derep}} objects), or a
#' \code{\link[tibble]{tibble}} with a column "derep" containing such a
#' list.
#' @param .data a \code{\link[tibble]{tibble}} with the same number of rows as
#' the length of \code{dereps}.
#' @param ... additional columns to add to the output.
#'
#' @details To be useful for further analysis, each sequence should be uniquely
#' identified. This can be done in several ways:
#' \itemize{
#' \item{\code{dereps} is a named \code{list} with unique names;}
#' \item{\code{dereps} is a \code{\link[tibble]{tibble}} with columns
#' (other than "derep") which uniquely identify the rows;}
#' \item{\code{.data} is provided and its rows are unique; or}
#' \item{\code{...} is provided and its combinations are unique.}}
#'
#' @return \code{list} with two members: \describe{
#' \item{\code{$map} (\code{\link[tibble]{tibble}})}{with columns:
#' "\code{file}" (\code{character}), "\code{idx}" (\code{integer}), and
#' "\code{map}" (\code{integer}), giving the mapping from the
#' "\code{idx}"th sequence in "\code{file}" to a sequence in
#' "\code{fasta}"}
#' \item{\code{$fasta} (
#' \code{\link[Biostrings:XStringSet-class]{DNAStringSet-class}})}{all
#' unique sequences; the name of each sequence is an \code{integer}
#'     which matches a value in \code{map$map}}
#' }
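#' @examples
#' # A minimal, hypothetical sketch; "derep1.rds" and "derep2.rds" are
#' # placeholder files holding dada2 derep objects, not shipped with the
#' # package.
#' \dontrun{
#' dereps <- list(
#'   sample1 = readRDS("derep1.rds"),
#'   sample2 = readRDS("derep2.rds")
#' )
#' combined <- combine_derep(dereps)
#' combined$map    # per-read mapping into the master sequence list
#' combined$fasta  # DNAStringSet of all unique sequences
#' }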
#' @export
combine_derep <- function(dereps, .data = NULL, ...) {
# handle different input types to get a tibble with the derep objects in one
# column
if (is.data.frame(dereps)) {
if (length(list(...)) > 0) {
dereps <- dplyr::bind_cols(
        tibble(.placeholder = seq_len(nrow(dereps)),
...),
dereps)
dereps <- dplyr::select(dereps, -".placeholder")
}
} else {
n <- names(dereps)
dereps <- tibble(..., derep = dereps)
if (!hasName(dereps, "name") && !is.null(n)) dereps$name <- n
}
if (!missing(.data)) {
dereps <- dplyr::bind_cols(.data, dereps)
}
# Check that our derep objects are uniquely identified
gps <- setdiff(names(dereps), "derep")
assert_that(dplyr::n_distinct(dereps[gps]) == nrow(dereps))
# get all the old mappings
# preserve the sequence names as "seq_id" if they are present
oldmap <- dereps
oldmap[["oldmap"]] <- lapply(oldmap[["derep"]], `[[`, "map")
nestedcols <- "oldmap"
if (all(vapply(oldmap[["derep"]], assertthat::has_name, TRUE, "names"))) {
oldmap[["seq_id"]] <- lapply(oldmap[["derep"]], `[[`, "names")
nestedcols <- c(nestedcols, "seq_id")
}
oldmap <- dplyr::select(oldmap, -"derep") %>%
tidyr::unnest(cols = nestedcols) %>%
dplyr::group_by_at(gps) %>%
dplyr::mutate(idx = 1:dplyr::n()) %>%
dplyr::ungroup()
# get the old unique sequences
olduniques <- dereps %>%
dplyr::mutate_at("derep",
~purrr::map(., .f = ~tibble(seq = names(.$uniques),
n = .$uniques))) %>%
tidyr::unnest(cols = "derep")
# combine duplicate sequences among all files.
newuniques <- olduniques %>%
dplyr::group_by(seq) %>%
dplyr::summarize(n = sum(n)) %>%
dplyr::arrange(dplyr::desc(n)) %>%
dplyr::transmute(seq = seq,
newmap = seq_along(.data$seq))
# create a mapping from the unique sequences list in each file
# to the master unique sequence list
newderep <- olduniques %>% {
dplyr::left_join(
dplyr::select(., dplyr::one_of(gps), seq) %>%
dplyr::group_by_at(gps) %>%
dplyr::mutate(oldmap = seq_along(seq)) %>%
dplyr::ungroup(),
newuniques,
by = "seq")
}
out <- list()
# map from the individual sequences in each file to the master unique
# sequence list
out$map <- dplyr::left_join(oldmap,
dplyr::select(newderep, dplyr::one_of(gps),
"oldmap", "newmap"),
by = c(gps, "oldmap")) %>%
dplyr::select(-oldmap, map = "newmap")
#unique sequence list
out$fasta <- Biostrings::DNAStringSet(
x = purrr::set_names(newuniques$seq, newuniques$newmap)
)
class(out) <- c("multiderep", class(out))
return(out)
}
#' Produce a map between denoised sequences and their original identifiers.
#'
#' @param derep a \code{\link[dada2]{derep-class}} object or list of such
#' objects
#' @param dada (\link[dada2]{dada-class} object or list of such objects) the
#' results of a call to \code{\link[dada2]{dada}} on \code{"derep"}
#' @param seq_id (\code{character} or list of \code{character}) unique sequence
#' identifiers for the sequences in \code{"derep"}. These can
#' alternatively be provided using argument \code{"filename"} or
#' calling \code{\link{add_derep_names}} on \code{"derep"}.
#' @param filename (\code{character} of same length as \code{"derep"}) fasta/q
#' file that is the source for \code{"derep"}, used for setting
#' unique sequence IDs. If \code{"derep"} is a named \code{list}, then
#' the default is to assume that these filenames are the names of the
#' source files (as returned by \code{\link[dada2]{derepFastq}} when
#' multiple files are given). To avoid this behavior, pass an explicit
#' \code{NULL}.
#' @param ... additional columns to add to the output. Names included in the
#' output by default should be avoided.
#'
#' @details Columns \code{$derep_seq} and \code{$dada_seq} contain one sequence
#' per read as plain character strings. Keeping a separate, dereplicated
#' list of sequences and storing references to them may seem like it would
#' be more memory efficient, but it is not necessary to do this explicitly,
#' because this is what R already does with \code{character} vectors; only
#' one copy of each unique string is actually kept in memory, and everything
#' else is pointers.
#'
#' @return a \code{\link[tibble]{tibble}} with columns:
#' \describe{
#' \item{\code{$name} (\code{character})}{names from \code{derep}, if it
#' is a named list of \code{\link[=add_derep_names]{named_derep}}
#' objects. Otherwise absent unless provided in \code{...}.}
#' \item{\code{...} (\code{character})}{any additional arguments, passed
#' on to \code{\link[tibble]{tibble}}}
#' \item{\code{$seq_id} (\code{character})}{the sequence identifiers
#' from the original fasta/q file.}
#' \item{\code{$derep_idx} (\code{integer})}{the index of each sequence
#' in \code{derep$uniques}.}
#' \item{\code{$derep_seq} (\code{character})}{the sequences.}
#' \item{\code{$dada_seq} (\code{character})}{the denoised sequences.}
#' }
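#' @examples
#' # A minimal sketch; "reads.fastq.gz" is a hypothetical input file.
#' \dontrun{
#' drp <- dada2::derepFastq("reads.fastq.gz")
#' err <- dada2::learnErrors("reads.fastq.gz")
#' dd <- dada2::dada(drp, err = err)
#' dadamap(drp, dd, filename = "reads.fastq.gz")
#' }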
#' @export
dadamap <- function(derep, dada, filename = NULL, seq_id = NULL, ...) {
UseMethod("dadamap")
}
#' @export
dadamap.derep <- function(derep, dada, filename = NULL, seq_id = NULL, ...) {
if (is.null(seq_id)) {
if (is.null(filename)) {
if (hasName(derep, "names")) {
seq_id <- derep$names
} else if (hasName(derep, "seq_id")) {
seq_id <- derep$seq_id
} else {
stop(paste("sequence names for derep object must be given",
"using arguments 'seq_id', 'file', or by calling",
"'add_derep_names()' on the derep object."))
}
} else {
assertthat::assert_that(
assertthat::is.string(filename),
file.exists(filename)
)
seq_id <- names(
tryCatch(
Biostrings::readBStringSet(filename, format = "fasta"),
error = function(e) {
Biostrings::readBStringSet(filename, format = "fastq")
}
)
)
}
}
m <- tibble(
seq_id = seq_id,
derep_idx = derep$map,
derep_seq = names(derep$uniques)[.data$derep_idx],
...
)
m <- dplyr::left_join(
m,
tibble(
dada_idx = dada$map,
derep_idx = seq_along(.data$dada_idx),
dada_seq = dada$sequence[.data$dada_idx]
),
by = "derep_idx"
)
class(m) <- c("dadamap", class(m))
m
}
#' @rdname dadamap
#' @param dir (\code{character string}) A directory to look for files in.
#' @export
dadamap.list <- function(derep, dada, filename = names(derep), seq_id = NULL,
dir = NULL,
...) {
assert_that(assertthat::are_equal(length(derep), length(dada)))
assert_that(all(
purrr::map_lgl(derep, is.null) == purrr::map_lgl(dada, is.null)
))
for (i in seq_along(derep)) {
assert_that(methods::is(derep[[i]], "derep") || is.null(derep[[i]]))
assert_that(methods::is(dada[[i]], "dada") || is.null(dada[[i]]))
}
args <- list(..., derep = derep, dada = dada)
if (!is.null(names(derep))) {
args[["name"]] <- names(derep)
args <- args[tidyselect::vars_select(names(args), "name",
tidyselect::everything())]
}
if (!is.null(filename)) {
assert_that(
is.character(filename),
assertthat::are_equal(length(filename), length(derep))
)
if (!is.null(dir)) {
assert_that(
assertthat::is.string(dir),
dir.exists(dir)
)
filename <- file.path(dir, filename)
}
args[["filename"]] <- filename
}
if (!is.null(seq_id)) {
assert_that(
is.list(seq_id),
      assertthat::are_equal(length(seq_id), length(derep)),
all(purrr::map_lgl(seq_id, is.character))
)
args[["seq_id"]] <- seq_id
}
args <- do.call(tibble::tibble, args) %>%
dplyr::filter(!purrr::map_lgl(derep, is.null))
out <- purrr::pmap_dfr(args, dadamap.derep)
class(out) <- c("dadamap", class(out))
out
}
#' Individually hash biological sequences
#'
#' @param seq (\code{character} or \code{\link[Biostrings]{XStringSet-class}})
#' the sequences to hash.
#' @param algo (\code{character}) a hash algorithm supported by
#' \code{\link[digest]{digest}}. default: "xxhash32"
#' @param len (\code{integer}) number of characters to keep from each hash
#' string. \code{NA} (the default) to keep all characters.
#' @param preserve_na (\code{logical}) If \code{TRUE}, \code{NA} values in
#' \code{seq} are preserved as \code{NA} in the output. If
#' \code{FALSE}, then \code{NA} is passed to
#' \code{\link[digest]{digest}}, which results in a valid hash.
#'
#' @return a \code{character} vector of the same length as \code{seq},
#' with the hashed sequences.
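#' @examples
#' # hashes are deterministic, so these short examples are runnable as-is;
#' # NA input is preserved as NA by default
#' seqhash(c("ACGT", "ACGA", NA))
#' # keep only the first 4 characters of each hash
#' seqhash(c("ACGT", "ACGA"), len = 4)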
#' @export
seqhash <- function(seq, algo = "xxhash32", len = NA, preserve_na = TRUE) {
UseMethod("seqhash")
}
#' @rdname seqhash
#' @export
seqhash.character <- function(
seq,
algo = "xxhash32",
len = NA,
preserve_na = TRUE
) {
h <- vapply(seq, digest::digest, "", algo = algo)
if (preserve_na) h[is.na(seq)] <- NA_character_
if (is.na(len)) {
return(h)
} else {
substring(h, 1, len)
}
}
#' @rdname seqhash
#' @export
seqhash.XStringSet <- function(
seq,
algo = "xxhash32",
len = NA,
preserve_na = TRUE
) {
  seqhash.character(as.character(seq), algo = algo, len = len,
                    preserve_na = preserve_na)
}
#' Add sequence names to a derep object
#'
#' @param derep (object of class \code{\link[dada2:derep-class]{derep}} or a
#' \code{list} of such objects) object(s) to add names to.
#' @param ... passed to methods
#'
#' @return (object of class \code{derep}, or a list of such objects) a
#' shallow copy of \code{derep}, with an additional member
#' "\code{$names}", giving the identifiers for the sequences from the
#' original fasta/q file.
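#' @examples
#' # A minimal sketch; "sample1.fastq.gz" is a hypothetical input file.
#' \dontrun{
#' drp <- dada2::derepFastq("sample1.fastq.gz")
#' drp <- add_derep_names(drp, filename = "sample1.fastq.gz")
#' head(drp$names)
#' }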
#' @export
add_derep_names <- function(derep, ...) UseMethod("add_derep_names")
#' @rdname add_derep_names
#' @param filename (\code{character}) name of fasta/q file that the
#' \code{\link[dada2:derep-class]{derep}} object was derived from.
#' @export
add_derep_names.derep <- function(derep, filename, ...) {
assert_that(file.exists(filename))
fqs <- ShortRead::FastqStreamer(filename, n = 1e4)
on.exit(close(fqs))
seqids <- character(0)
while (length(fq <- ShortRead::yield(fqs, qualityType = "FastqQuality"))) {
seqids <- c(seqids, as.character(fq@id))
}
derep[["names"]] <- seqids
return(derep)
}
#' @rdname add_derep_names
#' @param filenames (\code{character}) name(s) of file(s) that a \code{list} of
#' \code{\link[dada2:derep-class]{derep}} was derived from.
#' @export
add_derep_names.list <- function(derep, filenames = names(derep), ...) {
  if (!all(vapply(derep, methods::is, TRUE, "derep") |
             vapply(derep, is.null, TRUE))) {
    stop("all elements of 'derep' should be derep objects (or NULL)")
  }
  assert_that(assertthat::are_equal(length(derep), length(filenames)))
  derep2 <- mapply(add_derep_names.derep, derep, filenames, SIMPLIFY = FALSE)
  names(derep2) <- filenames
  return(derep2)
}
#' Summarize one or more reads as a \code{\link[tibble]{tibble}}.
#'
#' @param sread (\code{\link[ShortRead:ShortReadQ-class]{ShortReadQ}} object or
#' list of such objects) as produced by
#' \code{\link[ShortRead]{readFastq}}.
#' @param max_ee (\code{numeric}) filter out reads with expected error greater
#' than \code{max_ee}. Default: Inf (no filtering)
#' @param ... (any vector) additional columns to add to the output
#' \code{\link[tibble]{tibble}}. These should have length 1 or, if
#' \code{sread} is a named list of
#' \code{\link[ShortRead:ShortReadQ-class]{ShortReadQ}}, the length of
#' \code{sread}. Recycling behavior is as in
#' \code{\link[tibble]{tibble}}. Avoid the name "\code{name}" if
#' \code{sread} is a named \code{list}
#'
#' @return a \code{\link[tibble]{tibble}} with columns: \describe{
#' \item{\code{$seq_id} (\code{character})}{sequence IDs, typically from
#' the input fastq}
#' \item{\code{$seq} (\code{character})}{the actual sequence reads}
#' \item{\code{$name} (\code{character})}{the name of the source
#' \code{\link[ShortRead:ShortReadQ-class]{ShortReadQ}}, if
#' \code{sread} was a list. Otherwise absent.}
#' \item{\code{...}}{any other arguments passed to summarize_sread.}}
#' If the input was empty, not a valid
#' \code{\link[ShortRead:ShortReadQ-class]{ShortReadQ}}, or no sequences
#' passed filtering, a \code{\link[tibble]{tibble}} with zero rows is
#' returned.
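#' @examples
#' # A minimal sketch; "reads.fastq.gz" is a hypothetical input file.
#' \dontrun{
#' fq <- ShortRead::readFastq("reads.fastq.gz")
#' # keep only reads with at most 3 expected errors, and tag the source sample
#' summarize_sread(fq, sample = "sample1", max_ee = 3)
#' }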
#' @export
summarize_sread <- function(sread, ..., max_ee) UseMethod("summarize_sread")
#' @export
summarize_sread.ShortReadQ <- function(sread, ..., max_ee = Inf) {
if (!methods::is(sread, "ShortReadQ")) {
return(
tibble(
seq_id = character(),
seq = character()
)
)
}
out <- tibble(
seq_id = as.character(sread@id),
seq = as.character(sread@sread),
...)
ee <- rowSums(10 ^ (-1 * (methods::as(sread@quality, "matrix") / 10)),
na.rm = TRUE)
  out[ee <= max_ee, , drop = FALSE]
}
#' @export
summarize_sread.list <- function(sread, ..., max_ee = Inf) {
out <- tibble(
.seqs = lapply(sread, summarize_sread.ShortReadQ, max_ee = max_ee),
...
)
if (!is.null(names(sread)) && !hasName(out, "name")) {
out$name <- names(sread)
}
tidyr::unnest(out, ".seqs")
}
#' Test if all characters in a character vector are members of an alphabet.
#'
#' @param seq (\code{character}) character string(s) to test
#' @param alphabet (\code{character} with all elements of width 1)
#'
#' @return TRUE if all characters in \code{seq} are also in \code{alphabet}
#' @details This function internally uses regular expressions, so
#' \code{alphabet} should not begin with "^" or contain "\\". "-",
#' which commonly represents a gap, is handled correctly.
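#' @examples
#' # TRUE: both strings use only IUPAC DNA characters (including the gap "-")
#' has_alphabet(c("ACGT", "AC-GT"), Biostrings::DNA_ALPHABET)
#' # FALSE: "U" is not part of the DNA alphabet
#' has_alphabet("ACGU", Biostrings::DNA_ALPHABET)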
#' @export
has_alphabet <- function(seq, alphabet) {
regex <- paste0("^[", paste0(alphabet, collapse = ""), "]+$")
# make sure '-' is not interpreted as defining a character range
regex <- sub(x = regex, pattern = "-", replacement = "\\\\-")
return(all(grepl(pattern = regex, x = seq, perl = TRUE), na.rm = TRUE))
}
#' Calculate consensus of a cluster of sequences.
#'
#' This algorithm assumes that the sequences "should be" identical except for
#' amplification and sequencing errors. Its main purpose is to calculate a
#' consensus sequence for an amplicon that is too long to use in DADA2 directly,
#' but which has been clustered based on sequence variant identity in one
#' subregion.
#'
#' @param seq (\code{character} vector or
#' \code{\link[Biostrings]{XStringSet-class}})
#' The sequences to calculate a consensus for.
#' @param nread (\code{integer} vector) For the purposes of calculating the
#' consensus, consider each read to occur \code{nread} times. Supplying
#' unique values for \code{seq} along with the corresponding \code{nread}
#' is much faster than supplying duplicate reads to
#' \code{cluster_consensus}.
#' @param names (\code{character}) If \code{seq} is a \code{character} vector,
#' names for the sequences.
#' @param ncpus (\code{integer}) Number of CPUs to use.
#' @param simplify (\code{logical}) If \code{TRUE}, return an object of the same
#' type as \code{seq} containing a single sequence representing the
#' consensus. If \code{FALSE}, an object of the same type as \code{seq}
#' representing the consensus sequence for reads which were included in
#' the consensus, or \code{NA_character_} for reads which were initially
#' \code{NA} or which were removed from the consensus alignment as
#' outliers. For the \code{\link[Biostrings]{XStringSet-class}} method,
#' which does not allow \code{NA} entries, these elements are missing from
#' the set (this can be deduced by the names).
#' @param ... passed to methods
#'
#' @details The sequences are first aligned using
#' \code{\link[DECIPHER]{AlignSeqs}}. Sequences which are "outliers" in the
#' alignment are then removed by
#' \code{\link[odseq]{odseq}}. If the input sequences were clustered based on
#' DADA2 sequence variants of a variable region, and the sequences were
#' appropriately quality filtered prior to running \code{\link[dada2]{dada}},
#' then outliers should mostly be chimeras.
#'
#' After outlier removal, sites with greater than 50\% gaps are removed, and
#' the most frequent letter (ignoring gaps) is chosen at all other sites. If no
#' letter has greater than 50\% representation at a position, then an IUPAC
#' ambiguous base representing at least 50\% of the reads at that position is
#' chosen for nucleotide sequences, or \code{"X"} for amino acids.
#'
#' @return an \code{\link[Biostrings]{XStringSet-class}} representing the
#' consensus sequence.
#'
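#' @examples
#' # A small sketch with near-identical reads from one hypothetical cluster;
#' # alignment and outlier removal require DECIPHER and odseq to be installed.
#' \dontrun{
#' seqs <- c(a = "ACGTACGTACGTACGT",
#'           b = "ACGTACGAACGTACGT",
#'           c = "ACGTACGTACGTACGT")
#' cluster_consensus(seqs, nread = c(3, 1, 2))
#' }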
#' @export
cluster_consensus <- function(seq, nread = 1, ..., ncpus = 1, simplify = TRUE) {
UseMethod("cluster_consensus")
}
#' @param dna2rna (logical) whether to convert \code{seq} from DNA to RNA, and
#' use (calculated) RNA secondary structure in alignments.
#' @rdname cluster_consensus
#' @export
cluster_consensus.character <-
function(seq, nread = 1, names = base::names(seq), dna2rna = TRUE, ...,
ncpus = 1, simplify = TRUE) {
seq <- rlang::set_names(seq, names)
    # recycle nread before dropping NA sequences, so the two stay aligned
    if (length(nread) == 1) nread <- rep(nread, length(seq))
    xss <- seq[!is.na(seq)]
    nread <- nread[!is.na(seq)]
if (has_alphabet(xss, Biostrings::DNA_ALPHABET)) {
xss <- Biostrings::DNAStringSet(xss)
if (dna2rna) {
xss <- Biostrings::RNAStringSet(xss)
}
} else if (has_alphabet(xss, Biostrings::RNA_ALPHABET)) {
xss <- Biostrings::RNAStringSet(xss)
}
result <-
cluster_consensus.XStringSet(
xss,
nread = nread,
ncpus = ncpus,
simplify = simplify
)
if (simplify) {
if (length(result) == 1) return(as.character(result))
return(NA_character_)
}
seq[] <- NA_character_
seq[names(result)] <- as.character(result)
seq
}
#' @rdname cluster_consensus
#' @export
cluster_consensus.XStringSet <-
function(seq, nread = 1, ..., ncpus = 1, simplify = TRUE) {
if (length(nread) == 1) nread <- rlang::rep_along(seq, nread)
assertthat::assert_that(length(nread) == length(seq))
# a single sequence is its own consensus
if (length(seq) == 1) return(seq)
# it's not possible to make a consensus from 2 sequences that disagree
if (sum(nread) < 3) return(seq[FALSE])
# if a single sequence represents a majority of reads,
# then it is the consensus
if (max(nread) > sum(nread)/2) return(seq[which.max(nread)])
if (methods::is(seq, "RNAStringSet")) {
mult_align_class <- Biostrings::RNAMultipleAlignment
seqset_class <- "RNAStringSet"
} else if (methods::is(seq, "DNAStringSet")) {
mult_align_class <- Biostrings::DNAMultipleAlignment
seqset_class <- "DNAStringSet"
} else if (methods::is(seq, "AAStringSet")) {
mult_align_class <- Biostrings::AAMultipleAlignment
seqset_class <- "AAStringSet"
} else {
stop("Unknown sequence class")
}
flog.info("Calculating consensus of %d sequences (%d unique)...",
sum(nread),
length(seq))
tictoc::tic("cluster_consensus")
on.exit(flog_toc("DEBUG"))
flog.debug("Aligning...")
tictoc::tic("cluster_consensus:aligning")
aln <- DECIPHER::AlignSeqs(seq, processors = ncpus, verbose = FALSE)
flog_toc("TRACE")
flog.debug("Removing outliers...")
tictoc::tic("cluster_consensus:outliers")
outliers <- odseq(mult_align_class(aln), weights = nread)
flog.trace("Removed %d/%d sequences as outliers.",
sum(outliers), length(outliers))
aln <- aln[!outliers]
nread <- nread[!outliers]
flog_toc("TRACE")
flog.trace("Masking gaps...")
tictoc::tic("cluster_consensus:masking")
aln2 <- rep(aln, nread)
aln2 <- aln2 %>%
mult_align_class() %>%
Biostrings::maskGaps(min.fraction = 0.5, min.block.width = 1) %>%
methods::as(seqset_class)
flog_toc("TRACE")
flog.trace("Calculating consensus...")
tictoc::tic("cluster_consensus:consensus")
on.exit(flog_toc("TRACE"), add = TRUE, after = FALSE)
result <- DECIPHER::ConsensusSequence(
aln2,
threshold = 0.5,
ambiguity = TRUE,
ignoreNonBases = TRUE,
includeTerminalGaps = FALSE
)
if (simplify) return(result)
result <- rep(result, length(aln))
names(result) <- names(aln)
result
}
is_gzname <- function(name) endsWith(name, ".gz")
detect_format <- function(name, format) {
if (is.null(name)) return(NULL)
if (!is.null(format)) {
assertthat::assert_that(
assertthat::is.string(format),
format %in% c("fasta", "fastq")
)
return(format)
}
nogz <- gsub("\\.gz$", "", name)
  if (endsWith(nogz, ".fasta")) {
"fasta"
} else if (endsWith(nogz, ".fastq")) {
"fastq"
} else {
futile.logger::flog.warn(
"Unable to determine intended format of '%s'",
name
)
"fasta"
}
}
write_region <- function(
seq, outfile = NULL, append = FALSE, format = NULL, compress = NULL) {
UseMethod("write_region")
}
write_region.ShortRead <- function(
seq, outfile = NULL, append = FALSE, format = NULL, compress = NULL) {
if (is.null(outfile)) return()
format <- detect_format(outfile, format)
if (is.null(compress)) compress <- is_gzname(outfile)
mode <- if (isTRUE(append)) "a" else "w"
if (format == "fastq") {
ShortRead::writeFastq(
seq, outfile, mode = mode, compress = compress
)
} else {
ShortRead::writeFasta(
seq, outfile, mode = mode, compress = compress
)
}
}
write_region.XStringSet <- function(
seq, outfile = NULL, append = FALSE, format = NULL, compress = NULL) {
if (is.null(outfile)) return()
format <- detect_format(outfile, format)
if (is.null(compress)) compress <- is_gzname(outfile)
Biostrings::writeXStringSet(
seq, outfile, append = append, compress = compress, format = format
)
}
write_region.QualityScaledXStringSet <- function(
seq, outfile = NULL, append = FALSE, format = NULL, compress = NULL) {
if (is.null(outfile)) return()
format <- detect_format(outfile, format)
if (format == "fastq") {
if (is.null(compress)) compress <- is_gzname(outfile)
Biostrings::writeQualityScaledXStringSet(
seq, outfile, append = append, compress = compress
)
} else NextMethod()
}
get_id <- function(x) UseMethod("get_id")
get_id.ShortRead <- function(x) as.character(ShortRead::id(x))
get_id.default <- names
#' Extract regions from a set of sequences (maybe with qualities)
#'
#' @param seq (\code{character} scalar (a file name), an object belonging to
#' several classes representing nucleotide sequences, a \code{character}
#' vector of nucleotide sequences, or a \code{list} of any of these) the
#' sequences to extract regions from. To extract from several files, use
#' a \code{list} of single filenames, instead of a vector of filenames.
#' @param positions (\code{data.frame}) as returned by \code{\link[rITSx]{itsx}}
#' with \code{positions = TRUE} and \code{read_function} set; should have
#' columns \code{$seq_id} with sequence IDs (matching those in \code{seq}),
#' \code{$region} giving the name of each region, and \code{$start} and
#' \code{$end} giving the start and stop location, if found, of each
#' region.
#' @param region (\code{character}) The region to extract. Should match a value
#' given in \code{positions$region}.
#' @param region2 (\code{character}) If different from \code{region}, then the
#' entire segment beginning at the start of \code{region} and ending at
#' the end of \code{region2} will be extracted. For instance, to extract
#' the entire ITS region, use \code{region = 'ITS1', region2 = 'ITS2'}.
#' @param outfile (\code{character}) If given, the output will be written to the
#' filename given in fasta or fastq format. The format is determined by
#' \code{seq}, not by the extension of \code{outfile}.
#' @param append (\code{logical} scalar) if \code{TRUE}, then data is appended
#' to \code{outfile}; if \code{FALSE}, existing data in \code{outfile} is
#' overwritten.
#' @param format (\code{character}) File format to write (if \code{outfile} is
#' given). Default is to guess based on ".fasta[.gz]" or ".fastq[.gz]"
#' extension.
#' @param compress (\code{logical} scalar or NULL) Whether to gz-compress
#' \code{outfile}. Default is to detect based on presence of ".gz"
#' extension.
#' @param ... Passed to methods.
#'
#' @return (\code{object} of the same class as \code{seq}, or if \code{seq}
#' is a filename, \code{\link[Biostrings]{XStringSet-class}} or
#' \code{\link[Biostrings]{QualityScaledXStringSet-class}} depending on the
#' format of the file) The requested region from each of the input
#' sequences where it was found.
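#' @examples
#' # A minimal sketch; 'pos' would come from rITSx::itsx(..., positions = TRUE)
#' # and "reads.fastq.gz" is a hypothetical input file.
#' \dontrun{
#' its1 <- extract_region(
#'   "reads.fastq.gz",
#'   positions = pos,
#'   region = "ITS1",
#'   outfile = "ITS1.fastq.gz"
#' )
#' }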
#' @export
extract_region <- function(seq, positions, region, region2 = region,
                           outfile = NULL, append = FALSE, format = NULL,
compress = NULL,
...)
UseMethod("extract_region")
do_extract_region <- function(seq, positions, region, region2 = region,
outfile = NULL, append = FALSE,
format = NULL, compress = NULL, ...) {
assert_that(
assertthat::is.string(region),
assertthat::is.string(region2),
assertthat::has_name(positions, "seq_id"),
assertthat::has_name(positions, "region"),
assertthat::has_name(positions, "start"),
assertthat::has_name(positions, "end"),
is.character(positions$seq_id),
is.character(positions$region),
is.integer(positions$start),
is.integer(positions$end)
)
if (!is.null(outfile)) {
assert_that(assertthat::is.string(outfile))
#create the output directory if needed
dir.create(dirname(outfile), recursive = TRUE, showWarnings = FALSE)
assert_that(dir.exists(dirname(outfile)))
# make sure the file exists even if we don't have anything to write.
if (file.exists(outfile) && !isTRUE(append)) file.remove(outfile)
if (!file.exists(outfile)) {
write_region(seq[FALSE], outfile, append = FALSE, format, compress)
}
}
# if the region is "full", then we don't need to cut anything.
if (region %in% c("full", "long", "short")) {
write_region(seq, outfile, append, format, compress)
return(seq)
}
p <- positions %>%
tidyr::gather(
key = "border", value = "loc",
"start", "end") %>%
dplyr::filter(
(.data$border == "start" & .data$region == !!region) |
(.data$border == "end" & .data$region == region2)
) %>%
dplyr::select(-"region") %>%
tidyr::spread(key = "border", value = "loc") %>%
dplyr::filter(
!is.na(.data$start),
.data$start > 0,
!is.na(.data$end),
.data$end > 0,
# end <= readr::parse_number(length),
.data$end > .data$start
)
idx <- tibble(
seq_id = get_id(seq),
idx = seq_along(.data$seq_id)
) %>%
dplyr::left_join(dplyr::select(p, "seq_id"), ., by = "seq_id") %>%
dplyr::pull("idx")
if (nrow(p)) {
out <- IRanges::narrow(seq[idx], start = p$start, end = p$end)
write_region(out, outfile, append = TRUE, format, compress)
} else {
out <- seq[FALSE]
}
return(out)
}
#' @param qualityType (\code{character} scalar) fastq file quality encoding; see
#' \code{\link[ShortRead]{readFastq}}.
#' @rdname extract_region
#' @export
extract_region.character <- function(seq, positions, region, region2 = region,
outfile = NULL,
append = FALSE,
format = NULL,
compress = NULL,
qualityType = "FastqQuality", #nolint
...) {
assert_that(assertthat::is.flag(append),
is.null(format) || format %in% c("fasta", "fastq"))
if (length(seq) == 1 && file.exists(seq)) {
assert_that(is.data.frame(positions))
seq <- tryCatch(
Biostrings::readBStringSet(seq, format = "fasta"),
error = function(e) {
methods::as(
          ShortRead::readFastq(seq, qualityType = qualityType),
"QualityScaledXStringSet"
)
}
)
do_extract_region(seq = seq,
positions = positions,
region = region,
region2 = region2,
outfile = outfile,
append = append,
format = format,
compress = compress,
...)
} else {
seq <- Biostrings::BStringSet(seq)
    out <- do_extract_region(seq, positions, region, region2, outfile,
                             append, format, compress, ...)
as.character(out)
}
}
#' @rdname extract_region
#' @export
extract_region.XStringSet <- do_extract_region
#' @rdname extract_region
#' @export
extract_region.ShortRead <- do_extract_region
#' @rdname extract_region
#' @export
extract_region.list <- function(seq, positions, region, region2 = region,
                                outfile = NULL, append = FALSE, ...) {
  assert_that(length(positions) == length(seq))
  if (!is.null(outfile) && !append) unlink(outfile)
purrr::map2(
.x = seq,
.y = positions,
.f = extract_region,
region = region,
region2 = region2,
outfile = outfile,
append = TRUE,
...
)
}
str_modify <- function(x, regex = NULL, replace = NULL, ...) {
if (!is.null(regex) && !is.na(regex)) {
assert_that(assertthat::is.string(regex))
if (is.null(replace) || is.na(replace)) {
str_extract(x, regex, ...)
} else {
assert_that(assertthat::is.string(replace))
str_replace(x, regex, replace, ...)
}
} else {
x
}
}
output_spec <- function(output, order) {
if (is.character(output)) {
assert_that(assertthat::is.string(output))
output <- magrittr::set_names(list(order), output)
} else {
assert_that(
is.list(output),
rlang::is_named(output)
)
}
assert_that(
all(purrr::map_lgl(output, is.character)),
all(purrr::map_lgl(output, ~all(. %in% order)))
)
output
}
#' Reconstruct a longer region out of ASVs or consensus sequence of individual
#' domains.
#'
#' The sequences from each denoised sub-region/domain are concatenated to create
#' a denoised sequence
#' for the long region. Additionally, de-novo bimera detection is performed
#' using \code{\link[dada2]{isBimeraDenovo}} or
#' \code{\link[dada2]{isBimeraDenovoTable}} on
#' sets of three consecutive sub-regions/domains; in the intended application,
#' these sets will be variable--conserved--variable.
#'
#' When not all sub-regions/domains for a given read have been successfully
#' denoised with DADA, then the missing regions are constructed using
#' \code{\link{cluster_consensus}}.
#'
#' @param seqtabs (\code{list} of \code{data.frame}) with columns
#' \code{read_column}, \code{asv_column}, and optionally
#' \code{sample_column}. Any additional columns are ignored.
#' \code{read_column} should give a unique ID for each sequencing read, and
#' \code{asv_column} should give the denoised sequence for the read.
#' @param regions (\code{character} vector with the same length as
#' \code{seqtabs}) The names of the regions/domains represented by each of
#' the tables in \code{seqtabs}. If not supplied, then \code{seqtabs}
#' should be named by the regions.
#' @param regions_regex (\code{character} scalar, or \code{NULL})
#' A \link[stringi:stringi-search-regex]{regular expression}. If
#' \code{regions_regex} is given but \code{regions_replace} is not, then
#' only the part of the entries in \code{regions} matching the regex are
#' used to define samples (using \code{\link[stringr]{str_extract}}). If
#' \code{regions_replace} is also used, then the regex is instead replaced
#' by \code{regions_replace} (using \code{\link[stringr]{str_replace}}).
#'     \code{NA_character_} is treated the same way as \code{NULL}.
#' @param regions_replace (\code{character} scalar, or \code{NULL})
#' Replacement string for \code{regions_regex}.
#'     \code{NA_character_} is treated the same way as \code{NULL}.
#' @param output (\code{character} scalar or named list of \code{character}
#' vectors) If a \code{character} scalar, then the name to be used for the
#' (single) output region. In this case the region will be the
#' concatenation of all the regions in \code{order}. Alternatively, a list
#' where the names are the names of the output regions, and the values are
#' \code{character} vectors giving the regions which should be concatenated
#' for each output region.
#' @param use_output (one of \code{"first"}, \code{"second"}, or \code{"no"}) If
#' one of the regions given by \code{output} is also present in
#'     \code{seqtabs}, then the \code{seqtabs} version is used preferentially
#'     if \code{use_output == "first"}, as a backup value when one of the
#' subregions/domains is missing if \code{use_output == "second"}, or not
#' at all if \code{use_output == "no"}.
#' @param order (\code{character} vector) The order in which the
#' sub-regions/domains should be concatenated to produce the output(s).
#' @param read_column (\code{character} scalar) Column name from the
#' \code{seqtabs} which uniquely identifies each read (but different
#' regions extracted from the same read should have the same ID.)
#' @param asv_column (\code{character} scalar) Column name from the
#' \code{seqtabs} which gives the denoised sequences.
#' @param rawtabs (\code{list} of \code{data.frame}) Data sources of the same
#' format as \code{seqtabs}, with columns \code{read_column} and
#' \code{raw_column}. These should be of the same number as
#' \code{seqtabs}, and correspond to the sub-regions/domains specified in
#' \code{regions}. The default is to look for \code{raw_column} in
#' \code{seqtabs}.
#' @param raw_column (\code{character} scalar, or \code{NULL})
#' Column name from the \code{seqtabs} which gives the raw sequences. If
#' \code{NULL} or \code{NA_character_}, then consensus sequences will not
#' be used as a backup when no denoised sequence is present.
#' @param raw_regions (\code{character} vector with the same length as
#' \code{rawtabs}) The names of the regions/domains represented by each
#' of the tables in \code{rawtabs}. These will be processed using
#' \code{regions_regex} and \code{regions_replace}, if given.
#' @param sample_column (\code{character} scalar, or \code{NULL}) An optional
#' column name from the \code{seqtabs} which identifies which sample each
#' sequence is from. If given, this is used (after possible modification
#' by \code{sample_regex} and \code{sample_replace}) to identify
#' different samples for \code{\link[dada2]{isBimeraDenovoTable}}.
#'     \code{NA_character_} is treated the same way as \code{NULL}.
#' @param sample_regex (\code{character} scalar, or \code{NULL}) A
#' \link[stringi:stringi-search-regex]{regular expression}. If
#' \code{sample_regex} is given but \code{sample_replace} is not, then
#' only the part of the entries in \code{sample_column} matching the
#' regex are used to define samples (using
#' \code{\link[stringr]{str_extract}}). If \code{sample_replace} is also
#' used, then the regex is instead replaced by \code{sample_replace}
#'     (using \code{\link[stringr]{str_replace}}). \code{NA_character_} is
#' treated the same way as \code{NULL}.
#' @param sample_replace (\code{character} scalar, or \code{NULL})
#'     Replacement string for \code{sample_regex}. \code{NA_character_} is
#' treated the same way as \code{NULL}.
#' @param chimera_offset (\code{integer}) By default, bimeras are checked for
#' sub-region/domains 1, 2, 3; 3, 4, 5; 5, 6, 7; etc. This is appropriate
#' if the domains alternate variable, conserved, variable, etc. If a
#' more conserved domain is first, use \code{chimera_offset = 1}.
#' @param allow_map (\code{logical} scalar) If \code{TRUE} and if \code{asvs}
#' contains non-missing values, attempt to map each raw read without a
#' corresponding ASV to the nearest ASV.
#' @param allow_consensus (\code{logical} scalar) If \code{TRUE} and if
#' \code{allow_map} is \code{FALSE} or there are no non-missing values in
#' \code{asvs}, then attempt to make a consensus of all raw reads.
#' @param allow_raw (\code{logical} scalar) If \code{TRUE}, then after mapping
#' and/or consensus building, remaining raw reads are taken as they are.
#' If \code{FALSE}, the corresponding results will be \code{NA}.
#' @param ... additional arguments passed to \code{\link[dada2]{isBimeraDenovo}}
#' or \code{\link[dada2]{isBimeraDenovoTable}}.
#'
#' @return a \code{\link[tibble]{tibble}} with column "\code{seq_id}" and
#' \code{sample_column} (if given), as well as one column for each value
#' of \code{regions} and \code{output}, representing the
#' sub-regions/domains and the concatenated full region.
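#' @examples
#' # A sketch assuming its1_map and its2_map are dadamap-style tibbles with
#' # columns "seq_id" and "dada_seq" (hypothetical objects):
#' \dontrun{
#' long <- reconstruct(
#'   seqtabs = list(ITS1 = its1_map, ITS2 = its2_map),
#'   output = "ITS",
#'   order = c("ITS1", "ITS2")
#' )
#' }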
#' @export
reconstruct <- function(
seqtabs,
regions = names(seqtabs),
regions_regex = NULL,
regions_replace = NULL,
output = "concat",
use_output = c("first", "second", "no"),
order = regions,
read_column = "seq_id",
asv_column = "dada_seq",
rawtabs = seqtabs,
raw_column = NULL,
raw_regions = names(rawtabs),
sample_column = NULL,
sample_regex = NULL,
sample_replace = NULL,
chimera_offset = 0,
allow_map = TRUE,
allow_consensus = TRUE,
allow_raw = FALSE,
...
) {
assert_that(
is_string_or_missing(raw_column),
is.character(order)
)
use_output <- match.arg(use_output)
if (is.null(raw_column) || is.na(raw_column)) raw_column <- NULL
regions <- str_modify(regions, regions_regex, regions_replace)
assert_that(all(order %in% regions))
output <- output_spec(output, order)
region_table <- assemble_region_table(
seqtabs = seqtabs,
regions = regions,
output = output,
order = order,
read_column = read_column,
seq_column = asv_column,
sample_column = sample_column,
sample_regex = sample_regex,
sample_replace = sample_replace
)
chims <- find_all_region_chimeras(
region_table = region_table,
order = order,
sample_column = sample_column,
read_column = read_column,
chimera_offset = chimera_offset)
if (!is.null(raw_column)) {
    raw_regions <- str_modify(raw_regions, regions_regex, regions_replace)
raw_table <- assemble_region_table(
seqtabs = rawtabs,
regions = raw_regions,
output = output,
order = order,
read_column = read_column,
seq_column = raw_column,
sample_column = sample_column,
sample_regex = sample_regex,
sample_replace = sample_replace
)
raw_table <- dplyr::semi_join(raw_table, region_table, by = read_column)
region_table <- consensus_missing_regions(
region_table = region_table,
raw_table = raw_table,
order = order,
read_column = read_column,
allow_map = allow_map,
allow_consensus = allow_consensus,
allow_raw = allow_raw,
...
)
}
for (o in names(output)) {
this_chims <- purrr::map_lgl(chims$chimset, ~all(. %in% output[[o]]))
this_chims <- chims$chims[this_chims]
this_chims <- unlist(this_chims)
this_chims <- unique(this_chims)
this_chims <- which(region_table[[read_column]] %in% this_chims)
region_table <- reconstruct_region(
region_table = region_table,
output = o,
use_output = use_output,
order = output[[o]],
chims = this_chims
)
}
region_table
}
reconstruct_region <- function(region_table, output, order, use_output,
chims = integer(0)) {
if (output %in% names(region_table) && use_output != "no") {
region_table[["_concat_"]] <-
do.call(
str_c,
region_table[, order]
)
if (use_output == "first") {
region_table[[output]] <-
dplyr::coalesce(
region_table[[output]],
region_table[["_concat_"]]
)
} else {
region_table[[output]] <-
dplyr::coalesce(
region_table[["_concat_"]],
region_table[[output]]
)
}
region_table[["_concat_"]] <- NULL
} else {
region_table[[output]] <-
do.call(
str_c,
region_table[, order]
)
}
region_table[[output]][chims] <- NA_character_
region_table
}
#' Combine raw and denoised reads from multiple regions of the same sequences.
#'
#' @param dadamap (\code{\link{dadamap}} object)
#' @param rawdata (\code{\link[tibble]{tibble}}) as returned by
#' \code{\link{summarize_sread}}
#'
#' @details Both of the inputs should be annotated with identifying columns to
#' uniquely identify the source \code{\link[dada2:derep-class]{derep}} objects;
#' this should happen automatically if \code{\link[dada2]{derepFastq}} and
#' \code{\link[ShortRead]{readFastq}} are called on a list of filenames (there
#' will be a column "name" in the outputs of \code{\link{dadamap}} and
#' \code{\link{summarize_sread}}).
#'
#' @return a \code{\link[tibble]{tibble}} giving the sequences for each region
#' for each read (uniquely identified by all other columns)
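#' @examples
#' # A sketch assuming 'dm' comes from dadamap() and 'raw' from
#' # summarize_sread(), both annotated with a shared "name" column
#' # (hypothetical objects):
#' \dontrun{
#' combine_bigmaps(dm, raw)
#' }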
#' @export
combine_bigmaps <- function(dadamap, rawdata) {
joincols <- intersect(colnames(dadamap), colnames(rawdata))
dplyr::full_join(dadamap, rawdata, by = joincols) %>%
dplyr::group_by("seq_id") %>%
dplyr::filter(any(!is.na(.data$dada_idx))) %>%
dplyr::mutate(
seq = dplyr::coalesce(.data$dada_seq, .data$derep_seq, .data$seq)
) %>%
dplyr::select(-"derep_seq", -"derep_idx", -"dada_seq", -"dada_idx")
}
is_string_or_missing <- function(x) {
is.null(x) || is.na(x) || assertthat::is.string(x)
}
assemble_region_table <- function(
seqtabs,
regions = names(seqtabs),
output = "concat",
order = regions,
read_column = "seq_id",
seq_column = "dada_seq",
sample_column = NULL,
sample_regex = NULL,
sample_replace = NULL
) {
output <- output_spec(output, order)
assert_that(
is.list(output),
rlang::is_named(output),
assertthat::is.string(read_column),
assertthat::is.string(seq_column),
is_string_or_missing(sample_column),
is_string_or_missing(sample_regex),
is_string_or_missing(sample_replace)
)
if (!is.null(sample_column) && is.na(sample_column)) sample_column <- NULL
seqtabs <- purrr::map2(
seqtabs,
regions,
function(st, reg) magrittr::set_names(
st[, c(sample_column, read_column, seq_column)],
c(sample_column, read_column, reg)))
seqtabs <- purrr::map(
unique(regions) %>% magrittr::set_names(., .),
function(r) {
dplyr::bind_rows(seqtabs[regions == r])
}
)
if (!is.null(sample_column)) {
seqtabs <- purrr::map(
seqtabs,
dplyr::mutate_at,
sample_column,
str_modify,
sample_regex,
sample_replace
)
}
all_regions <- c(order, setdiff(intersect(regions, names(output)), order))
out <- purrr::reduce(
seqtabs[all_regions],
dplyr::full_join,
by = c(sample_column, read_column)
)
# for (o in names(output)) {
# if (o %in% regions) {
# out <- dplyr::full_join(
# out,
# seqtabs[[o]],
# by = c(sample_column, read_column)
# )
# }
# }
out
}
#' Find chimeras in adjacent variable-conserved-variable domain triplets
#'
#' @param region_table (\code{data.frame} containing all the regions in
#' \code{order})
#' @param order (\code{character} with the regions to check)
#' @param sample_column (\code{character} scalar) extra column in region_table
#' which identifies different samples; if present, use
#' \code{\link[dada2]{isBimeraDenovoTable}} instead of
#' \code{\link[dada2]{isBimeraDenovo}}.
#' @param read_column (\code{character} scalar) column in region_table
#' which identifies different reads.
#' @param chimera_offset (\code{integer} scalar) use \code{1} if the first
#' region is conserved.
#' @param ... passed to \code{find_region_chimeras}.
#'
#' @return a \code{\link[tibble]{tibble}} with columns:
#' \describe{
#' \item{chimset}{(\code{list} of \code{character}) the set of regions
#' (usually 3) which were checked in each row.}
#'     \item{chims}{(\code{list} of \code{character}) the sequence IDs of
#' the reads which were identified as chimeric for each set of
#' regions}}
find_all_region_chimeras <- function(
region_table,
order,
sample_column = NULL,
read_column = "read_id",
chimera_offset = 0,
...
) {
result <- tibble(
    first = seq(1 + chimera_offset, max(1, length(order) - 2), 2),
last = pmin(length(order), .data$first + 2),
chimset = purrr::map2(.data$first, .data$last, ~order[.x:.y]),
chims = purrr::map(
.data$chimset,
find_region_chimeras,
region_table = region_table,
sample_column = sample_column,
read_column = read_column,
...
)
)
dplyr::select(result, "chimset", "chims")
}
#' Check for bimeras in a subset of regions
#'
#' @param region_table (\code{data.frame} containing all the regions in
#' \code{chimset})
#' @param chimset (\code{character} vector, usually of length 3) regions to
#' concatenate and then check for bimeras.
#' @param sample_column (\code{character} scalar) extra column in region_table
#' which identifies different samples; if present, use
#' \code{\link[dada2]{isBimeraDenovoTable}} instead of
#' \code{\link[dada2]{isBimeraDenovo}}.
#' @param read_column (\code{character} scalar) column in region_table
#' which identifies different reads.
#' @param ... passed on to \code{\link[dada2]{isBimeraDenovoTable}} or
#' \code{\link[dada2]{isBimeraDenovo}}.
#'
#' @return an \code{character} vector giving the read IDs of
#' \code{region_table} which were detected as bimeras.
find_region_chimeras <- function(region_table, chimset, sample_column,
read_column, ...) {
seqs <- do.call(str_c, region_table[, chimset])
chimset_name <- paste(chimset, collapse = "--")
flog.trace("Searching for bimeras in regions %s.", chimset_name)
  tictoc::tic(paste(chimset_name, "bimeras"))
chims <-
if (!is.null(sample_column)) {
tibble(sample = region_table[[sample_column]], seq = seqs) %>%
dplyr::filter(!is.na(seq)) %>%
dplyr::group_by(sample, seq) %>%
dplyr::summarize(nread = dplyr::n()) %>%
dplyr::ungroup() %>%
tidyr::spread(seq, "nread", fill = 0L) %>%
tibble::column_to_rownames("sample") %>%
as.matrix() %>%
dada2::isBimeraDenovoTable(...)
} else {
table(seqs) %>%
tibble(abundance = ., sequence = names(.)) %>%
as.data.frame() %>%
dada2::isBimeraDenovo(...)
}
flog_toc("TRACE")
flog.debug("Found %d/%d bimeric ASVs (%d/%d reads) in regions %s.",
sum(chims), length(chims),
sum(seqs %in% names(chims)[chims], na.rm = TRUE),
sum(!is.na(seqs)),
chimset_name)
region_table[[read_column]][seqs %in% names(chims)[chims]]
}
block_consensus <- function(.x, .y, reg, reg2, reg2_raw, read_column, ...) {
if (length(.y[[reg]]) == 0 || is.na(.y[[reg]])) return(.x)
.x[[reg2]] <- map_or_consensus(
asvs = .x[[reg2]],
raw = .x[[reg2_raw]],
names = .x[[read_column]],
...
)
.x
}
consensus_missing_regions <- function(
region_table,
raw_table,
order,
maxdist = 10,
read_column = "seq_id",
...
) {
# all combinations of ASV sequences
combos <- dplyr::group_by_at(region_table, order) %>%
dplyr::summarize(nreads = dplyr::n()) %>%
dplyr::ungroup()
# how many variants do we have for each region?
# sort decreasing, because we want to start with the most variable region
region_counts <- vapply(
order,
function(x) dplyr::n_distinct(combos[[x]], na.rm = TRUE),
1L
) %>%
sort(decreasing = TRUE)
out_table <- dplyr::left_join(region_table, raw_table, by = read_column,
suffix = c("", "_raw"))
for (reg in names(region_counts)) {
flog.info("Starting to process %s clusters.", reg)
tictoc::tic(paste(reg, "clusters"))
out_table <- dplyr::group_by_at(out_table, reg)
regstats <- dplyr::summarise_at(
out_table,
setdiff(order, reg),
list(
n_na = ~sum(is.na(.)),
all_na = ~all(is.na(.)),
n = length
)
)
for (reg2 in setdiff(order, reg)) {
all_na <- paste0(reg2, "_all_na")
n_na <- paste0(reg2, "_n_na")
flog.info(
"Interpolating %s values for %s ASVs.", reg2, reg)
tictoc::tic(paste0(reg, " clusters (", reg2, ")"))
flog.info(
"%d missing %s reads found in %d %s clusters with %s ASVs",
sum(regstats[[n_na]][!regstats[[all_na]] &
!is.na(regstats[[reg]])]),
reg2,
sum(
!regstats[[all_na]] &
regstats[[n_na]] > 0 &
!is.na(regstats[[reg]])
),
reg,
reg2
)
flog.info(
"%d missing %s reads found in %d %s clusters with no %s ASVs.",
sum(regstats[[n_na]][regstats[[all_na]] & !is.na(regstats[[reg]])]),
reg2,
sum(regstats[[all_na]] & !is.na(regstats[[reg]])),
reg,
reg2
)
reg2_raw <- paste0(reg2, "_raw")
out_table <- dplyr::group_map(
.tbl = out_table,
.f = block_consensus,
reg = reg,
reg2 = reg2,
reg2_raw = reg2_raw,
read_column = read_column,
...,
keep = TRUE
) %>%
dplyr::bind_rows() %>%
dplyr::group_by_at(reg)
regstats <- dplyr::summarise_at(
out_table,
setdiff(order, reg),
list(
n_na = ~sum(is.na(.)),
all_na = ~all(is.na(.)),
n = length
)
)
flog_toc()
flog.info(
"%d missing %s reads remain in %d %s clusters with %s ASVs",
sum(regstats[[n_na]][!regstats[[all_na]] &
!is.na(regstats[[reg]])]),
reg2,
sum(
!regstats[[all_na]] &
regstats[[n_na]] > 0 &
!is.na(regstats[[reg]])
),
reg,
reg2
)
flog.info(
"%d missing %s reads remain in %d %s clusters with no %s ASVs.",
sum(regstats[[n_na]][regstats[[all_na]] & !is.na(regstats[[reg]])]),
reg2,
sum(regstats[[all_na]] & !is.na(regstats[[reg]])),
reg,
reg2
)
}
flog_toc()
}
dplyr::select_at(out_table, names(region_table))
}
#' Replace unmapped raw reads with the nearest ASV
#'
#' @param asvs (\code{character} vector) ASV sequences mapped to a set of reads.
#' Should be \code{\link{NA_character_}} for reads which did not map to an
#' ASV.
#' @param raw (\code{character} vector) Raw read sequences for the same set of
#' reads as \code{asvs}. May be \code{\link{NA_character_}}
#' @param maxdist (\code{numeric} scalar) Maximum Levenshtein distance between
#' a raw read and an ASV for the read to be mapped to the ASV.
#'
#' @return a \code{character} vector the same length as \code{asvs}, which has
#' the closest ASV for each read.
#'
#' @details The value for element \code{i} of the result is determined as
#' follows:
#' \enumerate{
#' \item{\code{asvs[i]} is non-\code{NA}: the value from \code{asvs} is
#' used.}
#' \item{\code{asvs[i]} and \code{raw[i]} are both \code{NA}:
#' \code{\link{NA_character_}}}
#' \item{\code{asvs[i]} is \code{NA}, \code{raw[i]} is non-\code{NA}:
#' \enumerate{
#' \item{\code{raw[i]} is less than \code{maxdist} in edit
#' distance from at least one of the non-\code{NA} sequences
#' in \code{asvs}: the value from \code{asvs} which has the
#' smallest edit distance from \code{raw[i]} is used.}
#' \item{\code{raw[i]} is not less than \code{maxdist} in edit
#' distance from at least one of the non-\code{NA} sequences
#' in \code{asvs}: \code{NA_character_}}}}}
#'
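#' @examples
#' asvs <- c("ACGTACGT", NA, "ACGTACCT", NA)
#' raw  <- c("ACGTACGT", "ACGTACGA", "ACGTACCT", "TTTTTTTT")
#' # the second read maps to "ACGTACGT" (edit distance 1); the fourth stays
#' # NA because no ASV is within 'maxdist'
#' map_to_best_asv(asvs, raw, maxdist = 2)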
#' @export
map_to_best_asv <- function(asvs, raw, maxdist = 10) {
assert_that(
is.character(asvs),
is.character(raw),
length(asvs) == length(raw),
assertthat::is.number(maxdist),
maxdist >= 0
)
if (!anyNA(asvs)) return(asvs)
unique_asvs <- purrr::discard(unique(asvs), is.na)
required_raw <- raw[is.na(asvs)]
unique_raw <- purrr::discard(unique(required_raw), is.na)
if (length(unique_raw) == 0) return(asvs)
flog.debug(
paste("Calculating distance matrix between %d raw sequences",
"and %d ASVs."),
length(unique_raw),
    length(unique_asvs)
)
tictoc::tic("Distance matrix")
d <- adist(unique_raw, unique_asvs)
flog_toc("DEBUG")
replace_table <- tibble(
raw = unique_raw,
match = unique_asvs[apply(d, MARGIN = 1, which.min)],
dist = apply(d, MARGIN = 1, min)
)
replace_table <- dplyr::filter(
replace_table,
.data$dist <= maxdist
)
out <- dplyr::left_join(tibble(raw = raw), replace_table, by = "raw")
dplyr::coalesce(asvs, out$match)
}
#' Assign consensus sequences to unmapped reads
#'
#' This function is intended to be run on reads from a single sub-region/domain,
#' which have been clustered using a linked sub-region/domain; for instance,
#' ITS1 reads clustered based on identity/similarity of the linked ITS2 reads.
#'
#' If some of the target reads have been mapped to ASVs, then
#' \code{map_or_consensus} attempts to map additional raw reads to the same
#' ASVs using a (potentially) more relaxed criteria than
#' \code{\link[dada2]{dada}}. This is implemented in
#' \code{\link{map_to_best_asv}}.
#'
#' If, on the other hand, none of the input sequences have been assigned to an
#' ASV, then the entire group is taken to represent one cluster, and a consensus
#' sequence for the cluster is determined using \code{\link{cluster_consensus}}.
#' This process will remove outliers (generally chimeric in origin) and assign
#' \code{NA_character_} to the associated reads, as well as any reads which are
#' already \code{NA} due to quality filtering, failed region extraction, etc.
#'
#' @param asvs (\code{character} vector) ASV sequences mapped to a set of reads.
#' Should be \code{\link{NA_character_}} for reads which did not map to an
#' ASV.
#' @param raw (\code{character} vector) Raw read sequences for the same set of
#' reads as \code{asvs}. May be \code{\link{NA_character_}}
#' @param maxdist (\code{numeric} scalar) Maximum Levenshtein distance between
#' a raw read and an ASV for the read to be mapped to the ASV.
#' @param allow_map (\code{logical} scalar) If \code{TRUE} and if \code{asvs}
#' contains non-missing values, attempt to map each raw read without a
#' corresponding ASV to the nearest ASV.
#' @param allow_consensus (\code{logical} scalar) If \code{TRUE} and if
#' \code{allow_map} is \code{FALSE} or there are no non-missing values in
#' \code{asvs}, then attempt to make a consensus of all raw reads.
#' @param allow_raw (\code{logical} scalar) If \code{TRUE}, then after mapping
#' and/or consensus building, remaining raw reads are taken as they are. If
#' \code{FALSE}, the corresponding results will be \code{NA}.
#' @param ... passed to \code{\link{cluster_consensus.character}}
#'
#' @return a \code{character} vector the same length as \code{asvs}, which has
#' the closest ASV for each read if any ASVs are non-missing, or the cluster
#' consensus values for raw reads which were non-missing and not outliers.
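#' @examples
#' \dontrun{
#' # When some ASVs are present, unmapped raw reads are matched to the
#' # nearest ASV:
#' map_or_consensus(asvs = c("ACGTACGT", NA),
#'                  raw = c("ACGTACGT", "ACGTACGA"))
#' # When no ASVs are present, a consensus of the raw reads is attempted
#' # (requires DECIPHER):
#' map_or_consensus(asvs = rep(NA_character_, 3),
#'                  raw = rep("ACGTACGTACGT", 3))
#' }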
#' @export
map_or_consensus <- function(asvs, raw, maxdist = 10, allow_map = TRUE,
allow_consensus = TRUE, allow_raw = FALSE, ...) {
assert_that(
is.character(asvs),
is.character(raw),
length(asvs) == length(raw),
assertthat::is.number(maxdist),
maxdist >= 0,
assertthat::is.flag(allow_map),
assertthat::is.flag(allow_consensus),
assertthat::is.flag(allow_raw)
)
result <- asvs
if (!all(is.na(asvs)) && allow_map) {
result <- map_to_best_asv(asvs, raw, maxdist)
} else if (allow_consensus) {
result <- cluster_consensus(raw, simplify = FALSE, ...)
}
if (allow_raw) {
result <- dplyr::coalesce(result, raw)
}
result
}
gather_regions <- function(pos) {
tidyr::pivot_longer(
pos,
cols = c(tidyselect::ends_with("_start"), tidyselect::ends_with("_end")),
names_to = c("region", ".value"),
names_pattern = "(.+)_(start|end)"
)
}
spread_regions <- function(pos) {
tidyr::pivot_wider(
pos,
names_from = "region",
values_from = c("start", "end")
) %>%
magrittr::set_names(
stringr::str_replace(names(.), "(end|start)_(.+)", "\\2_\\1")
)
}