# Match Primers -----------------------------------------------------------
#' Make data frame of primer info for reads
#'
#' `make_read_primer_table` take read sequences and per-locus primer
#' information and produces a data frame of matching information for each read.
#' Mismatches are allowed, but not indels. The rows in the output data frame
#' will correspond exactly to the read vector given.
#'
#' Output Columns:
#'
#' * SeqOrig: Original read sequence
#' * FwdStart: start position of forward primer match
#' * Fwdstop: end position of forward primer match
#' * FwdMismatches: number of mismatches for the forward primer
#' * FwdLocus: name of the Locus whose forward primer matched
#' * RevStart: start position of reverse primer match
#' * RevStop: end position of reverse primer match
#' * RevMismatches: number of mismatches in the reverse primer
#' * RevLocus: name of the Locus whose reverse primer matched
#' * MatchingLocus: name of Locus matching in forward or both (if
#' `use_reverse_primers`) directions
#' * Seq: modified sequence based on `primer_action` argument(s), if
#' applicable
#'
#' Optionally reads can be modified based on the matched primer sequences in one
#' or both directions; see the `primer_action` arguments for those options. All
#' positions in the returned data frame are indexed from 1 and are oriented in
#' the same direction as the unmodified reads.
#'
#' The comparison of reads to primers is not exhaustive. This method gathers
#' exact matches first and searches the remaining reads for inexact matches in
#' order of decreasing abundance of exact matches, taking the first
#' sufficiently-similar match for each read. For `max_mismatches` values well
#' below the similarity between any two primer sequences (as will typically be
#' the case) this should result in the best-matching primer being reported for
#' each read, but is much faster than doing an exhaustive all-to-all comparison.
#'
#' @param seqs character vector of read sequences
#' @param locus_attrs data frame of locus attributes
#' @param max_mismatches integer number of mismatches allowed when checking
#' primers against reads
#' @param primer_action how should reads be modified based on matched primers?
#' Can be "none", "keep" (trim sequences to the primers but keep the primers
#' as-is), "replace" (trim sequences *and* replace the matched primer region
#' with the sequence from `locus_attrs`), or "remove" (trim sequences to
#' exclude the primer region).
#' @param primer_action_fwd a `primer_action` value for the forward primer
#' @param primer_action_rev a `primer_action` value for the reverse primer
#' @param max_mismatches_fwd a `max_mismatches` value for the forward primer
#' @param max_mismatches_rev a `max_mismatches` value for the reverse primer
#' @param use_reverse_primers use reverse primers, or just forward?
#' @param reverse_primer_r1 is each reverse primer in `locus_attrs` given in its
#' orientation on the forward read? This is used to determine how the primers
#' and reads should be reverse complemented before comparing.
#' @returns data frame of matched primer information, one row for each input
#' sequence
#' @export
#' @md
make_read_primer_table <- function(
seqs, locus_attrs,
max_mismatches = cfg("max_mismatches"),
primer_action = cfg("primer_action"),
primer_action_fwd = cfg("primer_action_fwd"),
primer_action_rev = cfg("primer_action_rev"),
max_mismatches_fwd = cfg("max_mismatches_fwd"),
max_mismatches_rev = cfg("max_mismatches_rev"),
use_reverse_primers = cfg("use_reverse_primers"),
reverse_primer_r1 = cfg("reverse_primer_r1")) {
if (is_blank(max_mismatches_fwd)) {
max_mismatches_fwd <- max_mismatches
}
if (is_blank(max_mismatches_rev)) {
max_mismatches_rev <- max_mismatches
}
if (is_blank(primer_action_fwd)) {
primer_action_fwd <- primer_action
}
if (is_blank(primer_action_rev)) {
primer_action_rev <- primer_action
}
# Find matching loci, by primer sequence, for each of a set of sequences,
# within a maximum distance. The output rows will match the input sequences.
match_primer_set <- function(
seqs, locus_attrs, max_mismatches, colnm, prefix, do_revcmp) {
primers <- locus_attrs[[colnm]]
if (do_revcmp) {
primers <- revcmp(primers)
}
matches <- find_primer_matches(seqs, primers, max_mismatches)
# To ensure output and input match up, interpolate NA rows where needed
matches <- matches[with(matches, order(SeqIdx, Mismatches)), ]
matches <- matches[match(seq_along(seqs), matches$SeqIdx), ]
matches$SeqIdx <- seq_along(seqs)
# Assign a locus based on the primer match
matches$Locus <- locus_attrs$Locus[matches$PrimerIdx]
# Get rid of this index, since it only makes sense relative to the primer
# data frame in here
matches$PrimerIdx <- NULL
# Add a custom prefix to each column
mod <- colnames(matches) != "SeqIdx"
colnames(matches)[mod] <- paste0(prefix, colnames(matches)[mod])
matches
}
# Find a matching locus via forward primers
matches_fwd <- match_primer_set(
seqs, locus_attrs, max_mismatches, "Primer", "Fwd", FALSE)
result <- matches_fwd
# Same for reverse, if we're using those
if (use_reverse_primers) {
matches_rev <- match_primer_set(
seqs, locus_attrs, max_mismatches, "ReversePrimer", "Rev",
! reverse_primer_r1)
result <- merge(result, matches_rev, by.x = "SeqIdx", sort = FALSE)
result$MatchingLocus <- result$FwdLocus
result$MatchingLocus[with(
result, is.na(FwdLocus) | is.na(RevLocus) | FwdLocus != RevLocus)] <- NA
} else {
result$MatchingLocus <- result$FwdLocus
}
result <- result[order(result$SeqIdx), ]
result <- cbind(SeqOrig = seqs, result, stringsAsFactors = FALSE)
result <- subset(result, select = -SeqIdx)
result <- handle_primers(
result, locus_attrs,
primer_action_fwd, primer_action_rev, reverse_primer_r1)
result
}
#' Modify reads based on matched primers
#'
#' `handle_primers` carries out the read modification specified in
#' [make_read_primer_table].
#'
#' @param result data frame of read and primer information as from
#' [make_read_primer_table]
#' @param locus_attrs data frame of locus attributes
#' @param primer_action_fwd keyword for handling forward primers (see
#' [make_read_primer_table])
#' @param primer_action_rev keyword for handling reverse primers (see
#' [make_read_primer_table])
#' @param reverse_primer_r1 is each reverse primer in `locus_attrs` given in its
#' orientation on the forward read?
#' @returns modified data frame with `Seq` column containing modified sequences
#' @md
handle_primers <- function(
result, locus_attrs, primer_action_fwd, primer_action_rev,
reverse_primer_r1) {
# Handling this as two steps for both forward and reverse parts:
# 1) substring to appropriate region
# 2) add any replacement strings, for the special case of "replace"
if (primer_action_rev != "none" && ! all(
c("RevStart", "RevStop") %in% colnames(result))) {
stop(
paste0(
"can't apply primer_action_rev \"",
primer_action_rev,
"\" without reverse primer information"))
}
start_poses <- switch(
primer_action_fwd,
none = 1,
keep = result$FwdStart,
replace = result$FwdStop + 1,
remove = result$FwdStop + 1)
if (is.null(start_poses)) {
stop("primer_action_fwd should be one of none/keep/replace/remove")
}
end_poses <- switch(
primer_action_rev,
none = nchar(result$SeqOrig),
keep = result$RevStop,
replace = result$RevStart - 1,
remove = result$RevStart - 1)
if (is.null(end_poses)) {
stop("primer_action_rev should be one of none/keep/replace/remove")
}
result$Seq <- substring(result$SeqOrig, start_poses, end_poses)
if (primer_action_fwd == "replace") {
prefix <- locus_attrs$Primer[
match(result$MatchingLocus, locus_attrs$Locus)]
result$Seq <- paste0(prefix, result$Seq)
}
if (primer_action_rev == "replace") {
suffix <- locus_attrs$ReversePrimer[
match(result$MatchingLocus, locus_attrs$Locus)]
if (! reverse_primer_r1) {
suffix <- revcmp(suffix)
}
result$Seq <- paste0(result$Seq, suffix)
}
idx <- is.na(result$MatchingLocus)
result$Seq[idx] <- result$SeqOrig[idx]
result
}
#' Find primer matches for reads
#'
#' Returns a data frame of index values for the two input vectors for each
#' match, along with start and stop positions within each read and a count of
#' base mismatches. There may be zero, one, or multiple output rows per input
#' read.
#'
#' Indels are not supported. Partial matches are supported at the 3' end and
#' are counted as mismatches.
#'
#' If `max_mismatches` is `NA`, the best match for every read and primer
#' combination will be included in the output. If `max_mismatches` is an
#' integer, at most one row will be provided for each read (for the first
#' discovered primer at or below that number of mismatches). In this mode the
#' primers are checked for perfect matches first and then re-checked in
#' decreasing order of abundance to find any imperfect matches. This means that
#' a search with a maximum mismatch count specified will be much faster than one
#' without, but could give counterintuitive results for some edge cases. For
#' example, with max mismatches of 10 and a read containing primer A at 5
#' mismatches and primer B at 8 mismatches, either A or B may be matched
#' depending on the relative abundance of perfect matches found elsewhere. In
#' practice, with dissimilar primer sequences and a low number of allowed
#' mismatches, this is unlikely to be a problem.
#'
#' @param seqs_reads character vector of read sequences
#' @param seqs_primers character vector of primer sequences
#' @param max_mismatches integer number of mismatches allowed when checking
#' primers against reads, or `NA` to check all combinations
#' @returns data frame of read and primer index pairs and match details
#' @md
find_primer_matches <- function(
seqs_reads, seqs_primers, max_mismatches = cfg("max_mismatches")) {
# Any NAs will be handled in the same way as empty strings
seqs_reads[is.na(seqs_reads)] <- ""
seqs_primers[is.na(seqs_primers)] <- ""
matches <- ordered(
character(length(seqs_reads)),
levels = unique(seqs_primers[seqs_primers != ""]))
result_stub <- data.frame(
SeqIdx = integer(),
PrimerIdx = integer(),
Start = integer(),
Stop = integer(),
Mismatches = integer())
if (length(seqs_reads) == 0 || length(levels(matches)) == 0) {
return(result_stub)
}
# Assign exact matches wherever observed, each time ignoring reads already
# matched. Using a munged version of the reads with some padding so we can
# find partial matches around the edges (without dealing with actual gaps)
if (! is.na(max_mismatches)) {
padding <- paste0(rep("X", max(0, nchar(levels(matches)))), collapse = "")
seqs_for_grep <- paste0(padding, seqs_reads, padding)
for (seq_primer in levels(matches)) {
idxl_mask <- is.na(matches)
idx <- grep(seq_primer, seqs_for_grep[idxl_mask])
matches[idxl_mask][idx] <- seq_primer
}
# reorder levels to prioritize the most abundant for all that follows
matches <- stats::reorder(matches, -table(matches)[matches])
# Second pass: approximate matches, in addition to those exact matches
for (seq_primer in levels(matches)) {
idxl_mask <- is.na(matches)
idx <- agrep(
seq_primer,
seqs_for_grep[idxl_mask],
max.distance = list(ins = 0, del = 0, all = max_mismatches))
matches[idxl_mask][idx] <- seq_primer
}
matches <- stats::reorder(matches, -table(matches)[matches])
}
# Now check actual mismatch details
len_max <- max(nchar(seqs_reads))
# In this big matrix, each row is a position, each column a query; in the
# colSums below, primers are compared as vectors column-wise to this matrix.
# (Using raw bytes as that makes the comparisons significantly faster)
queries <- make_raw_nt(seqs_reads, map = RAW_NT[1:4])
slices <- seq_len(len_max)
result <- do.call(rbind, lapply(levels(matches), function(seq_primer) {
# Each per-primer data frame here will initially have a row for every read.
# If max_mismatches is an integer we'll only assign to rows for reads
# already determined to have a match for each primer, and then remove NA
# rows in the end. If max_mismatches is NA, we'll compare all to all and
# keep all entries.
# Empty primer sequences always yield NA rows out.
result <- result_stub[seq_along(seqs_reads), ]
result$SeqIdx <- seq_along(seqs_reads)
result$PrimerIdx <- match(seq_primer, seqs_primers)
primer <- make_raw_nt(seq_primer, pad = 0x40)[, 1]
if (length(primer) > 1) {
# If a max mismatch was given, we'll only check the reads that showed a
# match for this primer. Otherwise we'll check everything.
if (is.na(max_mismatches)) {
idxl_reads <- seq_along(seqs_reads)
queries_here <- queries
} else {
idxl_reads <- matches %in% seq_primer
queries_here <- queries[, idxl_reads, drop = FALSE]
}
if (ncol(queries_here) > 0) {
for (start_pos in slices) {
end_pos <- min(len_max, start_pos + nchar(seq_primer) - 1L)
# to handle cases where the primer runs past the end of the reads,
# we'll count those bases as mismatches
truncate <- length(primer) - (end_pos - start_pos + 1)
primer_segment <- primer[1:(end_pos - start_pos + 1)]
chunk <- queries_here[start_pos:end_pos, , drop = FALSE]
# If a pair of bytes have no overlapping bits, it's a mismatch. Note
# that also includes the 0x00 we're using as the padding value.
mismatches <- truncate + colSums(
(chunk & primer_segment) == as.raw(0))
idxl_match <- with(
result,
is.na(Mismatches[idxl_reads]) | mismatches < Mismatches[idxl_reads])
result$Start[idxl_reads][idxl_match] <- start_pos
result$Stop[idxl_reads][idxl_match] <- end_pos
result$Mismatches[idxl_reads][idxl_match] <- as.integer(
mismatches[idxl_match])
}
}
}
subset(result, ! is.na(Mismatches))
}))
rownames(result) <- NULL
result
}
# Util --------------------------------------------------------------------
#' IUPAC DNA nucleotides as raw bytes
#'
#' `RAW_NT` is a named vector of raw bytes allowing fast comparison of IUPAC
#' DNA nucleotide characters. A, C, G, and T are assigned single individual
#' bits, and the standard IUPAC characters are assigned bitwise combinations of
#' these. This way `RAW_NT["A"] & RAW_NT["G"]` is zero, while
#' `RAW_NT["A"] & RAW_NT["R"]` is nonzero. The gap characters - and . are
#' assigned one remaining bit, leaving 3 bits unspecified.
#'
#' ```
#' - TGCA
#' A 0000 0001 01
#' C 0000 0010 02
#' G 0000 0100 04
#' T 0000 1000 08
#'
#' R 0000 0101 05
#' Y 0000 1010 0A
#' S 0000 0110 06
#' W 0000 1001 09
#' K 0000 1100 0C
#' M 0000 0011 03
#'
#' B 0000 1110 0E
#' D 0000 1101 0D
#' H 0000 1011 0B
#' V 0000 0111 07
#'
#' N 0000 1111 0F
#'
#' - 0001 0000 10
#' . 0001 0000 10
#' ```
#' @md
RAW_NT <- as.raw(c(1, 2, 4, 8, 16, 16))
names(RAW_NT) <- c("A", "C", "G", "T", "-", ".")
RAW_NT["R"] <- RAW_NT["A"] | RAW_NT["G"]
RAW_NT["Y"] <- RAW_NT["C"] | RAW_NT["T"]
RAW_NT["S"] <- RAW_NT["G"] | RAW_NT["C"]
RAW_NT["W"] <- RAW_NT["A"] | RAW_NT["T"]
RAW_NT["K"] <- RAW_NT["G"] | RAW_NT["T"]
RAW_NT["M"] <- RAW_NT["A"] | RAW_NT["C"]
RAW_NT["B"] <- RAW_NT["C"] | RAW_NT["G"] | RAW_NT["T"]
RAW_NT["D"] <- RAW_NT["A"] | RAW_NT["G"] | RAW_NT["T"]
RAW_NT["H"] <- RAW_NT["A"] | RAW_NT["C"] | RAW_NT["T"]
RAW_NT["V"] <- RAW_NT["A"] | RAW_NT["C"] | RAW_NT["G"]
RAW_NT["N"] <- RAW_NT["A"] | RAW_NT["C"] | RAW_NT["G"] | RAW_NT["T"]
#' Make matrix of raw bytes from nucleotide sequences
#'
#' Each sequence in the given vector becomes a column of the output matrix, with
#' a row for each position. Shorter sequences are padded with a specific value
#' at bottom of the matrix. With the defaults (see [RAW_NT]), A, C, G, and T
#' (case insensitive) are encoded as 01, 02, 03, and 04, IUPAC codes are bitwise
#' combinations of those values, padding values are 0x80, and any other
#' character is 0x00.
#'
#' @param seqs character vector of nucleotide sequences
#' @param map raw vector with nucleotide names and byte values
#' @param pad raw value to use for missing positions
#' @param other raw value to use for nucleotides not in `map`
#' @param chunksize integer number of sequences to process at a time. For large
#' (hundreds of thousands on up) numbers of input sequences this function can
#' use a lot of memory, but limiting the number of sequences processed at once
#' limits memory usage.
#' @return raw matrix with positions on rows and sequences on columns
#' @md
make_raw_nt <- function(
seqs, map = RAW_NT, pad = 0x80, other = 0x00, chunksize = 8000) {
pad <- as.raw(as.integer(pad))
other <- as.raw(as.integer(other))
if (length(seqs) == 0) {
return(matrix(raw())[0, 0])
}
seqs <- toupper(seqs)
len <- max(nchar(seqs))
idxs <- seq_len(len)
idx_chunks <- seq(1, length(seqs), chunksize)
out <- NULL
for (idx in idx_chunks) {
out_chunk <- do.call(cbind, lapply(
strsplit(seqs[idx:min(length(seqs), (idx + chunksize - 1))], ""),
function(vec) {
# will put NA for too-short seqs, but we need to distinguish that from
# NA from unrecognized NTs
vec <- vec[idxs]
pads <- is.na(vec)
# First set all NA entries to "other" value, but mask the missing cases
# with the pad value
vec <- map[vec]
vec[is.na(names(vec))] <- other
vec[pads] <- pad
unname(vec)
}))
if (is.null(out)) {
out <- out_chunk
} else {
out <- cbind(out, out_chunk)
}
if (length(idx_chunks) > 1) {
# Hadley says you never need to do this.
# http://adv-r.had.co.nz/memory.html
# > Despite what you might have read elsewhere, there’s never any need to
# > call gc() yourself. R will automatically run garbage collection
# > whenever it needs more space.
# But in my testing there is a substantial difference in total memory
# usage observed from the OS if I explicitly force a gc() here at the end
# of each iteration versus just relying on R to decide. I'd rather have
# it take slightly longer and avoid using another GB or more of RAM when
# handling larger files.
gc()
}
}
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.