R/designOligos.R

Defines functions .beautifyOligos .scoreOligos .scoreGcContent .scoreIdentityCoverage .designAmbiguousOligos .filterProbes .checkProbes .designMixedPrimers .filterPrimers .checkPrimers .isValid .convertToMatrices .isWithinRange .makeOligoDf .meanRange .allVariants .repeats .endRuns .gcClamp .reverseComplement .oligoMatrix .expandDegenerates .filterOligos .generateMixedOligos .mergeLists .mixRev .mixFwd .splitAndPaste .generateAmbiguousOligos .countDegeneracy .nmers designOligos

Documented in designOligos

#' Design primers and probes
#'
#' \code{designOligos()} designs oligos (primers and probes)
#' from a consensus profile.
#'
#' @param x An \code{RprimerProfile} object.
#'
#' @param maxGapFrequency
#' Maximum allowed gap frequency at the primer and probe binding sites in
#' the target alignment.
#' A number [0, 1], defaults to \code{0.01}.
#'
#' @param lengthPrimer
#' Primer length range. A numeric vector [15, 30],
#' defaults to \code{c(18, 22)}.
#'
#' @param maxDegeneracyPrimer
#' Maximum number of variants of each primer. A number [1, 256],
#' defaults to \code{4}.
#'
#' @param gcClampPrimer
#' If primers must have a GC clamp.
#' A GC clamp
#' is identified as two to three G or
#' C:s within the last five bases (3' end) of the primer.
#' \code{TRUE} or \code{FALSE}, defaults to \code{TRUE}.
#'
#' @param avoidThreeEndRunsPrimer
#' If primers with more than two runs
#' of the same nucleotide at the terminal 3' end should be avoided.
#' \code{TRUE} or \code{FALSE}, defaults to \code{TRUE}.
#'
#' @param gcPrimer
#' GC-content range for primers.
#' A numeric vector [0, 1], defaults to \code{c(0.40, 0.65)}.
#'
#' @param tmPrimer
#' Tm range for primers (in Celcius degrees).
#' A numeric vector [30, 90], defaults to \code{c(55, 65)}.
#'
#' @param concPrimer
#' Primer concentration in nM, for tm calculation. A number
#' [20, 2000], defaults to \code{500}.
#'
#' @param designStrategyPrimer
#' \code{"ambiguous"} or \code{"mixed"}. Defaults to \code{"ambiguous"}
#' (see details below).
#'
#' @param probe
#' If probes should be designed. \code{TRUE} or \code{FALSE},
#' defaults to \code{TRUE}.
#'
#' @param lengthProbe
#' Probe length range. A numeric vector [15, 40],
#' defaults to \code{c(18, 22)}.
#'
#' @param maxDegeneracyProbe
#' Maximum number of variants of each probe. A number [1, 256],
#' defaults to \code{4}.
#'
#' @param avoidFiveEndGProbe
#' If probes with G
#' at the 5' end should be avoided. \code{TRUE} or \code{FALSE},
#' defaults to \code{TRUE}.
#'
#' @param gcProbe
#' GC-content range for probes. A numeric vector [0, 1],
#' defaults to \code{c(0.40, 0.65)}.
#'
#' @param tmProbe
#' Tm range for probes (in Celcius degrees).
#' A numeric vector [30, 90], defaults to \code{c(55, 70)}.
#'
#' @param concProbe
#' Primer concentration in nM, for tm calculation. A numeric vector
#' [20, 2000], defaults to \code{250}.
#'
#' @param concNa
#' Sodium ion (equivalent) concentration in the PCR reaction (in M).
#' For calculation of tm and delta G.
#' A numeric vector [0.01, 1], defaults to \code{0.05} (50 mM).
#'
#' @return
#' An \code{RprimerOligo} object, containing the following information:
#'
#' \describe{
#'   \item{type}{Whether the oligo is a primer or probe.}
#'   \item{fwd}{\code{TRUE} if the oligo is valid in forward direction,
#'     \code{FALSE} otherwise.}
#'   \item{rev}{\code{TRUE} if the oligo is valid in reverse direction,
#'     \code{FALSE} otherwise.}
#'   \item{start}{Start position of the oligo.}
#'   \item{end}{End positon of the oligo.}
#'   \item{length}{Oligo length.}
#'   \item{iupacSequence}{Oligo sequence in IUPAC format
#'   (i.e. with ambiguous bases).}
#'   \item{iupaSequenceRc}{The reverse complement of the iupacSequence.}
#'   \item{identity}{For ambiguous oligos: average identity of the oligo.
#'     For mixed oligos: average identity of the 5' (consensus) part of the
#'     oligo. The value can range from 0 to 1.}
#'   \item{coverage}{For ambiguous oligos: average coverage of the oligo.
#'     For mixed oligos: average coverage of the 3' (degenerate) part of the
#'     oligo. The value can range from 0 to 1.}
#'   \item{degeneracy}{Number of sequence variants of the oligo.}
#'   \item{gcContentMean}{Mean GC-content of all sequence variants of the oligo.
#'   }
#'   \item{gcContent}{Range in GC-content of all sequence variants of
#'     the oligo.}
#'   \item{tmMean}{Mean tm of all sequence variants of the oligo
#'   (in Celcius degrees).}
#'   \item{tm}{Range in tm of all sequence variants of the oligo
#'   (in Celcius degrees).}
#'   \item{deltaGMean}{Mean delta G of all sequence variants of the oligo
#'   (in kcal/mol).}
#'   \item{deltaG}{Range in delta G of all sequence variants of the oligo
#'   (in kcal/mol).}
#'   \item{sequence}{All sequence variants of the oligo.}
#'   \item{sequenceRc}{Reverse complements of all sequence variants.}
#'   \item{gcContent}{GC-content of all sequence variants.}
#'   \item{tm}{Tm of all sequence variants (in Celcius degrees).}
#'   \item{deltaG}{Delta G of all sequence variants (in kcal/mol).}
#'   \item{method}{Design method used to generate the oligo: "ambiguous",
#'   "mixedFwd" or "mixedRev".}
#'   \item{score}{Oligo score, the lower the better.}
#'   \item{roiStart}{First position of the input \code{RprimerProfile} object
#'     (roi = region of interest).}
#'   \item{roiEnd}{Last position of the input \code{RprimerProfile} object.}
#' }
#'
#' An error message will return if no oligos are found. If so, a good idea
#' could be to re-run the design process with relaxed constraints.
#'
#' @details
#'
#' \strong{Valid oligos}
#'
#' For an oligo to be considered as valid, all sequence variants must fulfill
#' all the specified design constraints.
#'
#' Furthermore, oligos with at least one sequence variant containing
#' more than four consecutive runs
#' of the same
#' nucleotide (e.g. "AAAAA") and/or more than three consecutive runs
#' of the same di-nucleotide (e.g. "TATATATA") will be excluded
#' from consideration.
#'
#' \strong{Calculation of tm and delta G}
#'
#' Melting temperatures
#' are calculated for perfectly matching
#' DNA duplexes using the
#' nearest-neighbor
#' method (SantaLucia and Hicks, 2004), by using the following equation:
#'
#' \loadmathjax
#' \mjsdeqn{Tm = (\Delta H ^o \cdot 1000) / (\Delta S ^o + R \cdot \log [\mathrm{oligo}]) - 273.15}
#'
#' where \mjseqn{\Delta H ^o} is
#' the change in enthalpy (in cal/mol) and \mjseqn{\Delta S ^o} is the
#' change in entropy (in cal/K/mol) when an
#' oligo and a perfectly matching target sequence goes from random coil to
#' duplex formation.
#' \mjseqn{K} is the gas constant (1.9872 cal/mol \emph{K}).
#'
#' Delta G is calculated at 37 Celcius degrees, for when an oligo and a
#' perfectly matching target
#' sequence goes from random coil to duplex state, by using  the following
#' equation:
#'
#' \mjsdeqn{ \Delta G ^o _T = ( \Delta H ^o \cdot 1000 - T \cdot \Delta S ^o ) / 1000}{ASCII representation}
#
#' For both tm and delta G, the following salt correction method is used
#' for \mjseqn{ \Delta S^o }, as
#' described in SantaLucia and Hicks (2004):
#'
#' \mjsdeqn{ \Delta S^o [\mathrm{Na^+}] = \Delta S^o [\mathrm{1 M NaCl}] + 0.368 \cdot N / 2 \cdot \log [\mathrm{Na^+}] }
#'
#' where \mjseqn{N} is the total number of phosphates in the duplex, and [Na+] is
#' the total
#' concentration of monovalent cations.
#'
#' Nearest neighbor table values for \mjseqn{\Delta S^o} and \mjseqn{\Delta H^o} are
#' from SantaLucia and Hicks, 2004, and can be
#' retrieved calling \code{rprimer:::lookup$nn}.
#'
#' \strong{Primer design strategies}
#'
#' Primers can be generated by using one of the two following strategies:
#'
#' \itemize{
#' \item{The \strong{ambiguous strategy} (default) generates primers from the
#' IUPAC consensus sequence alone, which means that ambiguous bases can
#' occur at any position in the primer.}
#'
#' \item{The \strong{mixed strategy} generates primers from both the majority
#' and the IUPAC consensus sequence. These primers consist of a shorter
#' degenerate part at the 3' end (approx. 1/3 of the primer, targeting a
#' conserved
#' region) and a longer consensus part at the 5' end (approx.
#' 2/3 of the primer),
#' which instead of having ambiguous bases contains the most frequently occuring
#' nucleotide at each position.
#' This strategy resembles the widely-adopted Consensus-Degenerate Hybrid
#' Oligonucleotide Primer (CODEHOP) principle (Rose et al., 1998), and aims to
#' to allow amplification of highly variable targets using primers with
#' low degeneracy.
#' The idea is that the degenerate 3' end part will bind specifically to
#' the target sequence in the initial PCR cycles, and promote amplification
#' in spite of eventual mismatches at the 5' consensus part
#' (since 5' end mismatches are generally less detrimental than
#' 3' end mismatches). In this way, the generated products
#' will match the 5' ends of all primers perfectly, which allows them
#' to be efficiently amplified in later PCR cycles.
#' To provide a sufficiently high tm in spite of mismatches, it is
#' recommended to design relatively long primers (at least 25 bases) when using
#' this strategy.}
#' }
#'
#' Probes are always designed using the ambiguous strategy.
#'
#' \strong{Scoring system for oligos}
#'
#' All valid oligos are scored based on their identity, coverage and
#' average GC content. The scoring system is presented below.
#'
#' \strong{Identity and coverage}
#'
#' \tabular{lr}{
#' Value range \tab Score \cr
#' \eqn{(0.99, 1]} \tab 0 \cr
#' \eqn{(0.95, 0.99]} \tab 1 \cr
#' \eqn{(0.90, 0.95]} \tab 2 \cr
#' \eqn{\leq 0.90} \tab 3
#' }
#'
#' \strong{Average GC-content}
#'
#' This score is based on how much
#' the average GC-content deviates from 0.5 (in absolute value).
#'
#' \tabular{lr}{
#' Value range \tab Score \cr
#' \eqn{[0, 0.05)}
#' \tab 0 \cr
#' \eqn{[0.05, 0.1)}
#' \tab 1 \cr
#' \eqn{[0.1,  0.2)}
#' \tab 2 \cr
#' \eqn{\geq 0.2} \tab 3
#' }
#'
#' These scores are summarized. The weight of each individual score is 1, and
#' thus, the lowest and best
#' possible score for an oligo is 0, and the worst possible score is 9.
#'
#' @references
#' Rose, TM., Schultz ER., Henikoff JG., Pietrokovski S.,
#' McCallum CM., and Henikoff S. 1998. Consensus-Degenerate
#' Hybrid Oligonucleotide Primers for
#' Amplification of Distantly Related Sequences. Nucleic Acids Research 26 (7):
#' 1628-35.
#'
#' SantaLucia Jr, J., & Hicks, D. (2004).
#' The thermodynamics of DNA structural motifs.
#' Annu. Rev. Biophys. Biomol. Struct., 33, 415-440.
#'
#' @export
#'
#' @import mathjaxr
#'
#' @examples
#' data("exampleRprimerProfile")
#' x <- exampleRprimerProfile
#'
#' ## Design primers and probes with default values
#' designOligos(x)
designOligos <- function(x,
                         maxGapFrequency = 0.01,
                         lengthPrimer = c(18, 22),
                         maxDegeneracyPrimer = 4,
                         gcClampPrimer = TRUE,
                         avoidThreeEndRunsPrimer = TRUE,
                         gcPrimer = c(0.40, 0.65),
                         tmPrimer = c(50, 65),
                         concPrimer = 500,
                         designStrategyPrimer = "ambiguous",
                         probe = TRUE,
                         lengthProbe = c(18, 22),
                         maxDegeneracyProbe = 4,
                         avoidFiveEndGProbe = TRUE,
                         gcProbe = c(0.40, 0.65),
                         tmProbe = c(50, 70),
                         concProbe = 250,
                         concNa = 0.05) {
    if (!methods::is(x, "RprimerProfile")) {
        stop("'x' must be an RprimerProfile object.", call. = FALSE)
    }
    if (!(maxGapFrequency >= 0 && maxGapFrequency <= 1)) {
        stop("'maxGapFrequency' must be from 0 to 1.", call. = FALSE)
    }
    if (!(min(lengthPrimer) >= 15 && max(lengthPrimer) <= 40)) {
        stop("'lengthPrimer' must be from 15 to 40.", call. = FALSE)
    }
    if (!(maxDegeneracyPrimer >= 1 && maxDegeneracyPrimer <= 256)) {
        stop("'maxDegeneracyPrimer' must be from 1 to 256.", call. = FALSE)
    }
    if (!is.logical(gcClampPrimer)) {
        stop("'gcClampPrimer' must be TRUE or FALSE", call. = FALSE)
    }
    if (!is.logical(avoidThreeEndRunsPrimer)) {
        stop("'avoidThreeEndRunsPrimer' must be TRUE or FALSE", call. = FALSE)
    }
    if (!(min(gcPrimer) >= 0 && max(gcPrimer) <= 1)) {
        stop(
            "'gcPrimer' must be from 0 to 1, e.g. c(0.45, 0.65).",
            call. = FALSE
        )
    }
    if (!(min(tmPrimer) >= 20 && max(tmPrimer) <= 90)) {
        stop(
            "'tmPrimer' must be from 20 to 90, e.g. c(55, 60).",
            call. = FALSE
        )
    }
    if (!(concPrimer >= 20 && concPrimer <= 2000)) {
        stop("'concPrimer' must be from 20 to 2000.", call. = FALSE)
    }
    if (!(
        designStrategyPrimer == "ambiguous" || designStrategyPrimer == "mixed")
    ) {
        stop(
            "'designStrategyPrimer' must be either 'ambiguous' or 'mixed'.",
            call. = FALSE
        )
    }
    if (!is.logical(probe)) {
        stop("'probe' must be TRUE or FALSE", call. = FALSE)
    }
    if (!(min(lengthProbe) >= 15 && max(lengthProbe) <= 40)) {
        stop("'lengthProbe' must be from 15 to 40.", call. = FALSE)
    }
    if (!(maxDegeneracyProbe >= 1 && maxDegeneracyProbe <= 256)) {
        stop("'maxDegeneracyProbe' must be from 1 to 256.", call. = FALSE)
    }
    if (!is.logical(avoidFiveEndGProbe)) {
        stop("'avoidFiveEndGProbe' must be TRUE or FALSE", call. = FALSE)
    }
    if (!(min(gcProbe) >= 0 && max(gcProbe) <= 1)) {
        stop(
            "'gcProbe' must be from 0 to 1, e.g. c(0.45, 0.65).",
            call. = FALSE
        )
    }
    if (!(min(tmProbe) >= 20 && max(tmProbe) <= 90)) {
        stop(
            "'tmProbe' must be from 20 to 90, e.g. c(55, 60).",
            call. = FALSE
        )
    }
    if (!(concProbe >= 20 && concProbe <= 2000)) {
        stop("'concProbe' must be from 20 to 2000.", call. = FALSE)
    }
    if (!(concNa >= 0.01 && concNa <= 1)) {
        stop("'concNa' must be from 0.01 to 1.", call. = FALSE)
    }
    lengthPrimer <- seq(min(lengthPrimer), max(lengthPrimer))
    lengthProbe <- seq(min(lengthProbe), max(lengthProbe))
    if (designStrategyPrimer == "mixed") {
        primers <- .designMixedPrimers(
            x,
            maxGapFrequency = maxGapFrequency,
            lengthPrimer = lengthPrimer,
            maxDegeneracyPrimer = maxDegeneracyPrimer,
            gcClampPrimer = gcClampPrimer,
            avoidThreeEndRunsPrimer = avoidThreeEndRunsPrimer,
            gcPrimer = gcPrimer,
            tmPrimer = tmPrimer,
            concPrimer = concPrimer,
            concNa = concNa
        )
        if (nrow(primers) == 0L) {
            stop("No primers were found.", call. = FALSE)
        }
        oligos <- primers
        if (probe) {
            probes <- .designAmbiguousOligos(
                x,
                primer = FALSE,
                lengthProbe = lengthProbe,
                maxDegeneracyProbe = maxDegeneracyProbe,
                avoidFiveEndGProbe = avoidFiveEndGProbe,
                gcProbe = gcProbe,
                tmProbe = tmProbe,
                concProbe = concProbe,
                concNa = concNa
            )
            if (nrow(probes) == 0L) {
                stop("No probes were found.", call. = FALSE)
            }
            oligos <- rbind(oligos, probes)
        }
    } else {
        oligos <- .designAmbiguousOligos(
            x,
            maxGapFrequency = maxGapFrequency,
            primer = TRUE,
            lengthPrimer = lengthPrimer,
            maxDegeneracyPrimer = maxDegeneracyPrimer,
            gcClampPrimer = gcClampPrimer,
            avoidThreeEndRunsPrimer = avoidThreeEndRunsPrimer,
            gcPrimer = gcPrimer,
            tmPrimer = tmPrimer,
            concPrimer = concPrimer,
            probe = probe,
            lengthProbe = lengthProbe,
            maxDegeneracyProbe = maxDegeneracyProbe,
            avoidFiveEndGProbe = avoidFiveEndGProbe,
            gcProbe = gcProbe,
            tmProbe = tmProbe,
            concProbe = concProbe,
            concNa = concNa
        )
        if (nrow(oligos[oligos$type == "primer", ]) == 0L) {
            stop("No primers were found.", call. = FALSE)
        }
        if (probe) {
            if (nrow(oligos[oligos$type == "probe", ]) == 0L) {
                stop("No probes were found.", call. = FALSE)
            }
        }
    }
    oligos <- .scoreOligos(oligos)
    oligos <- .beautifyOligos(oligos)
    RprimerOligo(oligos)
}

# Helpers ======================================================================

#' @noRd
#'
#' @examples
#' .nmers(c("A", "G", "T", "T", "C", "G"), n = 4)
.nmers <- function(x, n) {
    start <- seq_len(length(x) - n + 1)
    end <- start + n - 1
    nmers <- lapply(start, \(i) x[start[[i]]:end[[i]]])
    do.call("rbind", nmers)
}

#' @noRd
#'
#' @examples
#' .countDegeneracy(c("A", "R", "T", "T", "N", "G"))
.countDegeneracy <- function(x) prod(lookup$degeneracy[x])

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' .generateAmbiguousOligos(exampleRprimerProfile, lengthOligo = 18)
.generateAmbiguousOligos <- function(x, lengthOligo = 20) {
    oligos <- list()
    oligos$iupacSequence <- .nmers(x$iupac, lengthOligo)
    oligos$start <- seq_len(nrow(oligos$iupacSequence)) + min(x$position) - 1
    oligos$end <- seq_len(
        nrow(oligos$iupacSequence)
    ) + lengthOligo - 1 + min(x$position) - 1
    oligos$length <- rep(lengthOligo, nrow(oligos$iupacSequence))
    oligos$degeneracy <- apply(oligos$iupacSequence, 1, .countDegeneracy)
    oligos$gapFrequency <- apply(.nmers(x$gaps, lengthOligo), 1, max)
    oligos$coverage <- .nmers(x$coverage, lengthOligo) |> rowMeans()
    oligos$identity <- .nmers(x$identity, lengthOligo) |> rowMeans()
    oligos$method <- rep("ambiguous", nrow(oligos$iupacSequence))
    oligos$roiStart <- rep(
        min(x$position, na.rm = TRUE), nrow(oligos$iupacSequence)
    )
    oligos$roiEnd <- rep(
        max(x$position, na.rm = TRUE), nrow(oligos$iupacSequence)
    )
    oligos
}

#' @noRd
#'
#' @examples
#' .splitAndPaste(t(matrix(rep(1, 10))), t(matrix(rep(2, 10))))
#' .splitAndPaste(t(matrix(rep(1, 10))), t(matrix(rep(2, 10))), combine = FALSE)
#' .splitAndPaste(t(matrix(rep(1, 10))), t(matrix(rep(2, 10))), rev = TRUE)
.splitAndPaste <- function(first, second, rev = FALSE, combine = TRUE) {
    n <- ncol(first)
    small <- seq_len(as.integer(n / 3))
    large <- seq(small[length(small)] + 1, n)
    if (rev) {
        first <- first[, small, drop = FALSE]
        second <- second[, large, drop = FALSE]
    } else {
        first <- first[, seq_along(large), drop = FALSE]
        second <- second[, small + n - length(small), drop = FALSE]
    }
    if (combine) {
        cbind(first, second)
    } else {
        list(first, second)
    }
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' .mixFwd(exampleRprimerProfile, 20)
.mixFwd <- function(x, lengthOligo = 20) {
    oligos <- list()
    majority <- .nmers(x$majority, lengthOligo)
    iupac <- .nmers(x$iupac, lengthOligo)
    oligos$iupacSequence <- .splitAndPaste(majority, iupac)
    oligos$start <- seq_len(nrow(oligos$iupacSequence)) + min(x$position) - 1
    oligos$end <- seq_len(
        nrow(oligos$iupacSequence)
    ) + lengthOligo - 1 + min(x$position) - 1
    oligos$length <- rep(lengthOligo, nrow(oligos$iupacSequence))
    oligos$degeneracy <- apply(oligos$iupacSequence, 1, .countDegeneracy)
    oligos$gapFrequency <- apply(.nmers(x$gaps, lengthOligo), 1, max)
    identityCoverage <- .splitAndPaste(
        .nmers(x$identity, lengthOligo), .nmers(x$coverage, lengthOligo),
        combine = FALSE
    )
    oligos$identity <- identityCoverage[[1]] |> rowMeans()
    oligos$coverage <- identityCoverage[[2]] |> rowMeans()
    oligos$method <- rep("mixedFwd", nrow(oligos$iupacSequence))
    oligos$roiStart <- rep(
        min(x$position, na.rm = TRUE), nrow(oligos$iupacSequence)
    )
    oligos$roiEnd <- rep(
        max(x$position, na.rm = TRUE), nrow(oligos$iupacSequence)
    )
    oligos
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' .mixRev(exampleRprimerProfile, 20)
.mixRev <- function(x, lengthOligo = 20) {
    oligos <- list()
    majority <- .nmers(x$majority, lengthOligo)
    iupac <- .nmers(x$iupac, lengthOligo)
    oligos$iupacSequence <- .splitAndPaste(iupac, majority, rev = TRUE)
    oligos$start <- seq_len(nrow(oligos$iupacSequence)) + min(x$position) - 1
    oligos$end <- seq_len(
        nrow(oligos$iupacSequence)
    ) + lengthOligo - 1 + min(x$position) - 1
    oligos$length <- rep(lengthOligo, nrow(oligos$iupacSequence))
    oligos$degeneracy <- apply(oligos$iupacSequence, 1, .countDegeneracy)
    oligos$gapFrequency <- apply(.nmers(x$gaps, lengthOligo), 1, max)
    coverageIdentity <- .splitAndPaste(
        .nmers(x$coverage, lengthOligo), .nmers(x$identity, lengthOligo),
        combine = FALSE, rev = TRUE
    )
    oligos$identity <- coverageIdentity[[2]] |> rowMeans()
    oligos$coverage <- coverageIdentity[[1]] |> rowMeans()
    oligos$method <- rep("mixedRev", nrow(oligos$iupacSequence))
    oligos$roiStart <- rep(
        min(x$position, na.rm = TRUE), nrow(oligos$iupacSequence)
    )
    oligos$roiEnd <- rep(
        max(x$position, na.rm = TRUE), nrow(oligos$iupacSequence)
    )
    oligos
}


#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' fwd <- .mixFwd(exampleRprimerProfile)
#' rev <- .mixRev(exampleRprimerProfile)
#' .mergeLists(fwd, rev)
.mergeLists <- function(first, second) {
    x <- lapply(names(first), \(i) {
        if (is.matrix(first[[i]])) {
            rbind(first[[i]], second[[i]])
        } else {
            c(first[[i]], second[[i]])
        }
    })
    names(x) <- names(first)
    x
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' .generateMixedOligos(exampleRprimerProfile, lengthOligo = 20)
.generateMixedOligos <- function(x, lengthOligo = 20) {
    fwd <- .mixFwd(x, lengthOligo)
    rev <- .mixRev(x, lengthOligo)
    .mergeLists(fwd, rev)
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' x <- .generateMixedOligos(exampleRprimerProfile)
#' .filterOligos(x)
.filterOligos <- function(x, maxGapFrequency = 0.1, maxDegeneracy = 4) {
    invalidCharacters <- apply(x$iupacSequence, 1, \(x) {
        any(x == "-") | any(is.na(x))
    })
    invalid <- unique(c(
        which(x$degeneracy > maxDegeneracy),
        which(x$gapFrequency > maxGapFrequency),
        which(invalidCharacters)
    ))
    if (length(invalid) > 0L) {
        x <- lapply(x, \(x) {
            if (is.matrix(x)) x[-invalid, , drop = FALSE] else x[-invalid]
        })
    }
    x
}

#' @noRd
#'
#' @examples
#' .expandDegenerates(c("A", "R", "T", "T", "N", "G"))
.expandDegenerates <- function(x) {
    bases <- lapply(x, \(i) {
        degen <- unname(lookup$degenerates[[i]])
        unlist(strsplit(degen, split = ","))
    })
    all <- expand.grid(bases[seq_along(bases)], stringsAsFactors = FALSE)
    all <- as.matrix(all)
    colnames(all) <- NULL
    all
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' x <- .filterOligos(.generateMixedOligos(exampleRprimerProfile))
#' x$sequence <- apply(x$iupacSequence, 1, .expandDegenerates)
#' .oligoMatrix(x$sequence)
.oligoMatrix <- function(x) {
    degeneracy <- vapply(x, nrow, integer(1L))
    id <- lapply(seq_along(degeneracy), \(x) rep(x, degeneracy[[x]]))
    id <- unlist(id)
    x <- do.call("rbind", x)
    rownames(x) <- id
    x
}

#' @noRd
#'
#' @examples
#' .reverseComplement(matrix(c("A", "R", "T", "T", "N", "G")))
.reverseComplement <- function(x) {
    rc <- x[, rev(seq_len(ncol(x))), drop = FALSE]
    rc[] <- lookup$complement[rc]
    rc
}

#' @noRd
#'
#' @examples
#' seq <- matrix(c(1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0))
#' .gcClamp(gc, rev = FALSE)
#' .gcClamp(gc, rev = TRUE)
.gcClamp <- function(x, rev = FALSE) {
    if (rev) {
        end <- x[, seq_len(5), drop = FALSE]
        5 - rowSums(end) >= 2 & 5 - rowSums(end) <= 3 ## Because of complement..
    } else {
        end <- x[, seq((ncol(x) - 4), ncol(x)), drop = FALSE]
        rowSums(end) >= 2 & rowSums(end) <= 3
    }
}

#' @noRd
#'
#' @examples
#' seq <- matrix(c("A", "C", "G", "G", "T", "T", "A", "A"))
#' .endRuns(seq, rev = FALSE)
.endRuns <- function(x, rev = FALSE) {
    if (rev) {
        end <- x[, seq_len(3), drop = FALSE]
    } else {
        end <- x[, seq((ncol(x) - 2), ncol(x)), drop = FALSE]
    }
    apply(end, 1, \(x) {
        all(x == "A") | all(x == "C") | all(x == "T") | all(x == "G")
    })
}

#' @noRd
#'
#' @examples
#' .repeats(c("ACTTTTCT", "ACTTTTTCT", "ATCTCTCTCA"))
.repeats <- function(x) {
    di <- "(AT){4,}|(TA){4,}|(AC){4,}|(CA){4,}|(AG){4,}|(GA){4,}|(GT){4,}|(TG){4,}|(CG){4,}|(GC){4,}|(CT){4,}|(TC){4,}|)"
    mono <- "([A-Z])\\1\\1\\1\\1"
    vapply(x, \(y) {
        grepl(di, y) | grepl(mono, y)
    }, logical(1L))
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' x <- .filterOligos(.generateMixedOligos(exampleRprimerProfile))
#' .allVariants(x)
.allVariants <- function(x,
                         concPrimer = 500,
                         concProbe = 250,
                         concNa = 0.05) {
    all <- list()
    all$sequence <- apply(x$iupacSequence, 1, .expandDegenerates, simplify = FALSE)
    ## If there is only one variant of each oligo,
    ## (and apply returns a matrix instead of a list):
    if (is.matrix(all$sequence)) {
        all$sequence <- t(all$sequence)
        all$sequence <- lapply(seq_len(nrow(all$sequence)), \(i) {
            all$sequence[i, , drop = FALSE]
        })
    }
    all$sequence <- .oligoMatrix(all$sequence)
    all$sequenceRc <- .reverseComplement(all$sequence)
    gc <- all$sequence == "C" | all$sequence == "G"
    n <- rowSums(
        all$sequence == "A" | all$sequence == "C" |
            all$sequence == "G" | all$sequence == "T"
    )
    all$gcContent <- rowSums(gc) / n
    all$gcClampFwd <- .gcClamp(gc)
    all$gcClampRev <- .gcClamp(gc, rev = TRUE)
    all$threeEndRunsFwd <- .endRuns(all$sequence)
    all$threeEndRunsRev <- .endRuns(all$sequence, rev = TRUE)
    all$fiveEndGPlus <- all$sequence[, 1] == "G"
    all$fiveEndGMinus <- all$sequence[, ncol(all$sequence)] == "C"
    tmParam <- .tmParameters(all$sequence, concNa)
    all$tmPrimer <- .tm(tmParam, concPrimer)
    all$tmProbe <- .tm(tmParam, concProbe)
    all$deltaG <- .deltaG(tmParam)
    all$sequence <- apply(all$sequence, 1, paste, collapse = "")
    all$sequenceRc <- apply(all$sequenceRc, 1, paste, collapse = "")
    all$repeats <- .repeats(all$sequence)
    lapply(all, \(x) unname(split(unname(x), f = as.integer(names(x)))))
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' x <- .allVariants(.filterOligos(.generateMixedOligos(exampleRprimerProfile)))
#' .meanRange(x)
.meanRange <- function(x) {
    x <- x[c("gcContent", "tmPrimer", "tmProbe", "deltaG")]
    means <- lapply(x, \(y) {
        vapply(y, \(z) {
            sum(z) / length(z)
        }, double(1L))
    })
    means <- do.call("cbind.data.frame", means)
    names(means) <- paste0(names(means), "Mean")
    ranges <- lapply(x, \(y) {
        vapply(y, \(z) {
            max(z) - min(z)
        }, double(1L))
    })
    ranges <- do.call("cbind.data.frame", ranges)
    names(ranges) <- paste0(names(ranges), "Range")
    cbind(means, ranges)
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' x <- .filterOligos(.generateMixedOligos(exampleRprimerProfile))
#' .makeOligoDf(x)
.makeOligoDf <- function(x) {
    x <- within(x, rm("gapFrequency"))
    x$iupacSequenceRc <- .reverseComplement(x$iupacSequence)
    x$iupacSequence <- apply(x$iupacSequence, 1, paste, collapse = "")
    x$iupacSequenceRc <- apply(x$iupacSequenceRc, 1, paste, collapse = "")
    do.call("cbind.data.frame", x)
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' x <- .allVariants(.filterOligos(.generateMixedOligos(exampleRprimerProfile)))
#' .isWithinRange(x$gcContent, c(0.4, 0.6))
.isWithinRange <- function(x, range) {
    lapply(x, \(y) y >= min(range) & y <= max(range))
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' x <- .allVariants(.filterOligos(.generateMixedOligos(exampleRprimerProfile)))
#' gcInRange <- .isWithinRange(x$gcContent, c(0.4, 0.6))
#' x <- data.frame(cbind(gcInRange))
#' .convertToMatrices(x["gcInRange"])
.convertToMatrices <- function(x) {
    lapply(seq_len(nrow(x)), \(i) {
        y <- lapply(x[i, , drop = FALSE], unlist)
        do.call("cbind", y)
    })
}

#' Column means represent the proportion
#' of sequence variants that fulfill a specific criteria (e.g. GC clamp),
#' and row means represent the proportion of the desired design criteria that
#' are fulfilled by specific sequence variants.
#'
#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' x <- .allVariants(.filterOligos(.generateMixedOligos(exampleRprimerProfile)))
#' gcInRange <- .isWithinRange(x$gcContent, c(0.4, 0.6))
#' x <- data.frame(cbind(gcInRange))
#' check <- .convertToMatrices(x["gcInRange"])
#' .isValid(check, rowThreshold = 0.5, colThreshold = 0.5)
.isValid <- function(x, rowThreshold = 1, colThreshold = 1) {
    valid <- vapply(x, \(y) {
        toInvert <- c(
            "repeats", "threeEndRunsFwd", "threeEndRunsRev",
            "fiveEndGPlus", "fiveEndGMinus"
        )
        select <- colnames(y) %in% toInvert
        y[, select] <- !y[, select]
        col <- colMeans(y)
        row <- rowMeans(y)
        all(col >= colThreshold) & all(row >= rowThreshold)
    }, logical(1L))
    valid
}

#' @noRd
.checkPrimers <- function(x,
                          gcClampPrimer = TRUE,
                          avoidThreeEndRunsPrimer = TRUE) {
    selectFwd <- c("repeats", "tmInRange", "gcInRange")
    selectRev <- c("repeats", "tmInRange", "gcInRange")
    if (gcClampPrimer) {
        selectFwd <- c(selectFwd, "gcClampFwd")
        selectRev <- c(selectRev, "gcClampRev")
    }
    if (avoidThreeEndRunsPrimer) {
        selectFwd <- c(selectFwd, "threeEndRunsFwd")
        selectRev <- c(selectRev, "threeEndRunsRev")
    }
    fwdPrimers <- x[selectFwd]
    fwdPrimers <- .convertToMatrices(fwdPrimers)
    fwd <- .isValid(fwdPrimers)
    revPrimers <- x[selectRev]
    revPrimers <- .convertToMatrices(revPrimers)
    rev <- .isValid(revPrimers)
    x <- cbind(x, fwd, rev)
    x[x$fwd | x$rev, , drop = FALSE]
}

#' @noRd
.filterPrimers <- function(x,
                           lengthPrimer = 18:22,
                           maxDegeneracyPrimer = 4,
                           gcClampPrimer = TRUE,
                           avoidThreeEndRunsPrimer = TRUE,
                           gcPrimer = c(0.45, 0.55),
                           tmPrimer = c(55, 65)) {
    x <- x[x$length %in% lengthPrimer, , drop = FALSE]
    x <- x[x$degeneracy <= maxDegeneracyPrimer, , drop = FALSE]
    gcInRange <- .isWithinRange(x$gcContent, gcPrimer)
    tmInRange <- .isWithinRange(x$tmPrimer, tmPrimer)
    x <- cbind(x, data.frame(cbind(tmInRange, gcInRange)))
    x <- .checkPrimers(
        x,
        gcClampPrimer,
        avoidThreeEndRunsPrimer
    )
    x$rev[x$method == "mixedFwd" & x$rev] <- FALSE
    x$fwd[x$method == "mixedRev" & x$fwd] <- FALSE
    x <- x[x$fwd | x$rev, , drop = FALSE]
    remove <- c(
        "gcInRange", "tmInRange",
        "okFwd", "okRev", "tmProbeMean", "tmProbeRange", "tmProbe"
    )
    x <- x[!names(x) %in% remove]
    oldnames <- c("tmPrimerMean", "tmPrimerRange", "tmPrimer")
    newnames <- c("tmMean", "tmRange", "tm")
    names(x)[names(x) %in% oldnames] <- newnames
    type <- rep("primer", nrow(x))
    cbind(type, x)
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' .designMixedPrimers(exampleRprimerProfile)
.designMixedPrimers <- function(x,
                                maxGapFrequency = 0.01,
                                lengthPrimer = 18:22,
                                maxDegeneracyPrimer = 4,
                                gcClampPrimer = TRUE,
                                avoidThreeEndRunsPrimer = TRUE,
                                gcPrimer = c(0.45, 0.55),
                                tmPrimer = c(55, 65),
                                concPrimer = 500,
                                concNa = 0.05) {
    lengthPrimer <- lengthPrimer[order(lengthPrimer)]
    all <- lapply(lengthPrimer, \(i) {
        mixed <- .generateMixedOligos(x, lengthOligo = i)
        mixed <- .filterOligos(
            mixed, maxGapFrequency, maxDegeneracyPrimer
        )
        if (length(mixed[[1]] > 0L)) {
            allVariants <- .allVariants(
                mixed, concPrimer,
                concProbe = NA, concNa
            )
            meansAndRanges <- .meanRange(allVariants)
            allVariants <- data.frame(do.call("cbind", allVariants))
            mixed <- .makeOligoDf(mixed)
            cbind(mixed, meansAndRanges, allVariants)
        } else {
            NULL
        }
    })
    all <- do.call("rbind", all)
    if (is.null(all)) {
        stop("No primers were found.", call. = FALSE)
    }
    all <- .filterPrimers(
        all,
        lengthPrimer,
        maxDegeneracyPrimer,
        gcClampPrimer,
        avoidThreeEndRunsPrimer,
        gcPrimer,
        tmPrimer
    )
    all
}

#' @noRd
.checkProbes <- function(x,
                         avoidFiveEndGProbe) {
    selectFwd <- c("repeats", "tmInRange", "gcInRange")
    selectRev <- c("repeats", "tmInRange", "gcInRange")
    if (avoidFiveEndGProbe) {
        selectFwd <- c(selectFwd, "fiveEndGPlus")
        selectRev <- c(selectRev, "fiveEndGMinus")
    }
    fwdProbes <- x[selectFwd]
    fwdProbes <- .convertToMatrices(fwdProbes)
    fwd <- .isValid(fwdProbes)
    revProbes <- x[selectRev]
    revProbes <- .convertToMatrices(revProbes)
    rev <- .isValid(revProbes)
    x <- cbind(x, fwd, rev)
    x[x$fwd | x$rev, , drop = FALSE]
}

#' @noRd
.filterProbes <- function(x,
                          lengthProbe = 18:22,
                          maxDegeneracyProbe = 4,
                          avoidFiveEndGProbe = TRUE,
                          gcProbe = c(0.45, 0.55),
                          tmProbe = c(55, 65)) {
    x <- x[x$length %in% lengthProbe, , drop = FALSE]
    x <- x[x$degeneracy <= maxDegeneracyProbe, , drop = FALSE]
    gcInRange <- .isWithinRange(x$gcContent, gcProbe)
    tmInRange <- .isWithinRange(x$tmProbe, tmProbe)
    x <- cbind(x, data.frame(cbind(tmInRange, gcInRange)))
    x <- .checkProbes(
        x,
        avoidFiveEndGProbe
    )
    remove <- c(
        "gcInRange", "tmInRange", "tmPrimerMean",
        "tmPrimerRange", "tmPrimer"
    )
    x <- x[!names(x) %in% remove]
    oldnames <- c("tmProbeMean", "tmProbeRange", "tmProbe")
    newnames <- c("tmMean", "tmRange", "tm")
    names(x)[names(x) %in% oldnames] <- newnames
    type <- rep("probe", nrow(x))
    cbind(type, x)
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerProfile")
#' .designAmbiguousOligos(exampleRprimerProfile)
.designAmbiguousOligos <- function(x,
                                   maxGapFrequency = 0.01,
                                   primer = TRUE,
                                   lengthPrimer = 18:22,
                                   maxDegeneracyPrimer = 4,
                                   gcClampPrimer = TRUE,
                                   avoidThreeEndRunsPrimer = TRUE,
                                   gcPrimer = c(0.40, 0.65),
                                   tmPrimer = c(50, 65),
                                   concPrimer = 500,
                                   probe = TRUE,
                                   lengthProbe = 18:22,
                                   maxDegeneracyProbe = 4,
                                   avoidFiveEndGProbe = TRUE,
                                   gcProbe = c(0.40, 0.65),
                                   tmProbe = c(50, 70),
                                   concProbe = 250,
                                   concNa = 0.05) {
    if (probe && primer) {
        lengthOligo <- unique(c(lengthPrimer, lengthProbe))
        maxDegeneracy <- max(c(maxDegeneracyPrimer, maxDegeneracyProbe))
    } else if (!probe && primer) {
        lengthOligo <- lengthPrimer
        maxDegeneracy <- maxDegeneracyPrimer
    } else if (probe && !primer) {
        lengthOligo <- lengthProbe
        maxDegeneracy <- maxDegeneracyProbe
    }
    lengthOligo <- lengthOligo[order(lengthOligo)]
    ambiguous <- lapply(lengthOligo, \(i) {
        amb <- .generateAmbiguousOligos(x, lengthOligo = i)
        amb <- .filterOligos(
            amb,
            maxGapFrequency = maxGapFrequency,
            maxDegeneracy = maxDegeneracy
        )
        if (length(amb[[1]] > 0L)) {
            allVariants <- .allVariants(
                amb,
                concPrimer = concPrimer,
                concProbe = concProbe,
                concNa = concNa
            )
            meanAndRange <- .meanRange(allVariants)
            allVariants <- data.frame(do.call("cbind", allVariants))
            amb <- .makeOligoDf(amb)
            cbind(amb, meanAndRange, allVariants)
        } else {
            NULL
        }
    })
    ambiguous <- do.call("rbind", ambiguous)
    if (is.null(ambiguous)) {
        stop("No primers were found.", call. = FALSE)
    }
    if (primer) {
        primers <- .filterPrimers(
            ambiguous,
            lengthPrimer,
            maxDegeneracyPrimer,
            gcClampPrimer,
            avoidThreeEndRunsPrimer,
            gcPrimer,
            tmPrimer
        )
    } else {
        primers <- NULL
    }
    if (probe) {
        probes <- .filterProbes(
            ambiguous,
            lengthProbe,
            maxDegeneracyProbe,
            avoidFiveEndGProbe,
            gcProbe,
            tmProbe
        )
    } else {
        probes <- NULL
    }
    rbind(primers, probes)
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerOligo")
#' x <- head(exampleRprimerOligo$identity)
#' .scoreIdentityCoverage(x)
.scoreIdentityCoverage <- function(x) {
    score <- vector(mode = "double", length = length(x))
    score[x <= 1 & x > 0.99] <- 0
    score[x <= 0.99 & x > 0.95] <- 1
    score[x <= 0.95 & x > 0.90] <- 2
    score[x <= 0.90] <- 3
    score
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerOligo")
#' x <- head(exampleRprimerOligo$gcContentMean)
#' .scoreGcContent(x)
.scoreGcContent <- function(x) {
    deviation <- abs(x - 0.5)
    score <- vector(mode = "double", length = length(deviation))
    score[deviation >= 0 & deviation < 0.05] <- 0
    score[deviation >= 0.05 & deviation < 0.1] <- 1
    score[deviation >= 0.1 & deviation < 0.2] <- 2
    score[deviation >= 0.2] <- 3
    score
}

#' @noRd
#'
#' @examples
#' data("exampleRprimerOligo")
#' x <- head(exampleRprimerOligo)
#' .scoreOligos(x)
.scoreOligos <- function(x) {
    score <- list()
    score$identity <- .scoreIdentityCoverage(x$identity)
    score$coverage <- .scoreIdentityCoverage(x$coverage)
    score$gcContent <- .scoreGcContent(x$gcContentMean)
    score <- do.call("cbind", score)
    score <- rowSums(score)
    cbind(x, score)
}

#' @noRd
.beautifyOligos <- function(x) {
    keep <- c(
        "type", "fwd", "rev", "start", "end", "length",
        "iupacSequence", "iupacSequenceRc", "identity",
        "coverage", "degeneracy", "gcContentMean", "gcContentRange",
        "tmMean", "tmRange", "deltaGMean", "deltaGRange", "sequence",
        "sequenceRc", "gcContent", "tm", "deltaG", "method", "score",
        "roiStart", "roiEnd"
    )
    x <- x[keep]
    x <- x[order(x$start), ]
    rownames(x) <- NULL
    x
}
sofpn/rprimer documentation built on July 2, 2023, 7:15 a.m.