R/primers.R

Defines functions get_covered.vanilla get_cvg_ratio get.analysis.mode write_primers pair_primers disambiguate.primers read_primers.internal read_primers_csv set.run.names read_primers_multiple read_primers_single read_primers create_region_boxes update_primer_cvg rbind.Primers cbind.Primers my_show_df Primers check_restriction_sites_single restriction_match restriction_ali restriction_hits update_primer_binding_regions check_restriction_sites validate_primers greedy.primers classify_design_problem estimate.cvg estimate.cvg.dir create.k.mers create.kmer

Documented in check_restriction_sites classify_design_problem get_cvg_ratio Primers read_primers write_primers

##########################
# Primer functionalities
##########################
#' Creation of k-mers of a single sequence.
#' @param seq A character vector.
#' @param k The size of the k-mer.
#' @return A names character vector, where the names 
#' are the relative positions of the k-mers and the values
#' give the character vector of the k-mer.
#' @keywords internal
create.kmer <- function(seq, k) {
    # generates all unique substrings of length k contained in the single sequence 'seq' 
    if (length(seq) != 1) {
        stop("This function is only for single sequences")
    }
    if (k > nchar(seq)) {
        k <- nchar(seq)
        warning("Specified k-mer size was larger than sequence length; ",
                "Reduced k-mer size to: ", k, ".")
    }
    k.mers <- unique(substring(seq, seq_len(nchar(seq) - k + 1), k:nchar(seq)))
    kmer.pos <- -seq(nchar(seq), k)
    names(k.mers) <- kmer.pos
    return(k.mers)
}
#' Creation of k-mers for multiple sequences.
#' @param seq A character vector.
#' @param k The size of the k-mer.
#' @return A list with named character vectors, containing the k-mers.
#' @keywords internal
create.k.mers <- function(seqs, k) {
    # generates all substrings of length k contained in the single sequence 'seq' 
    i <- NULL
    k.mers <- foreach (i = seq_along(seqs), .combine = c) %dopar% {
        k.mer <- list(create.kmer(seqs[i], k))
    }
    return(k.mers)
}
#' Estimation of Primer Coverage.
#'
#' Estimates the possible coverage of primers using
#' probes of size \code{k} and only considering perfect
#' matches without consideration of ambiguities.
#'
#' @param seqs A character vector of sequences to evaluate coverage for.
#' @param k A numeric giving the size of the primers.
#' @param id An optional identifier for the primers.
#' @return A data frame with binding information.
#' @keywords internal
estimate.cvg.dir <- function(seqs, k, id = "") {
    # 1. create all k-mers for input seqs
    kmers <- create.k.mers(seqs, k)
    # since we're creating before the target region, k mer pos is negative
    kmer.df <- do.call(rbind, lapply(seq_along(kmers), function(x) data.frame("Template_ID" = x, "Primer" = kmers[[x]], "Start" = as.numeric(names(kmers[[x]])), "End" = as.numeric(names(kmers[[x]])) + k)))
    # use ddply to determine binding positions / cvg
    binding.df <- ddply(kmer.df, "Primer", summarize, 
                    "Start" = paste(substitute(Start), collapse = ","), 
                    "End" = paste(substitute(End), collapse = ","),
                    "Covered_Seqs" = paste(substitute(Template_ID), collapse = ","),
                    "primer_coverage" = length(substitute(Template_ID)))
    binding.df$Coverage <- unlist(lapply(strsplit(binding.df$Start, split = ","), function(x) length(x)))
    binding.df$Coverage_Ratio <- binding.df$Coverage / length(seqs)
    return(binding.df)
}
#' Estimation of Primer Coverage.
#'
#' Estimates the possible coverage of primers using
#' probes of size \code{k} and only considering perfect
#' matches without consideration of ambiguities.
#'
#' @param seqs A character vector of sequences to evaluate coverage for.
#' @param k A numeric giving the size of the primers.
#' @param mode.directionality Estimation of coverage for forward/reverse/both?
#' @param sample An optional identifier for the sample.
#' @return A list with entries \code{fw} and \code{rev} giving
#' data frames for forward/reverse binding.
#' @keywords internal
estimate.cvg <- function(lex.df, k = 18, mode.directionality, sample = "") {
    cvg.fw <- NULL
    cvg.rev <- NULL
    if (mode.directionality == "fw") {
        # seqs <- lex.df$Allowed_fw
        cvg.fw <- estimate.cvg.dir(lex.df$Allowed_fw, k, id = paste0(sample, "_", "fw"))
    } else if (mode.directionality == "rev") {
        cvg.rev <- estimate.cvg.dir(lex.df$Allowed_rev, k, id = paste0(sample, "_", "rev"))
    } else {
        cvg.fw <- estimate.cvg.dir(lex.df$Allowed_fw, k, id = paste0(sample, "_", "fw"))
        cvg.rev <- estimate.cvg.dir(lex.df$Allowed_rev, k, id = paste0(sample, "_", "rev"))
    }
    return(list("fw" = cvg.fw, "rev" = cvg.rev))
}

#' @rdname PrimerDesign
#' @name PrimerDesign
#' @aliases classify_design_problem
#' @details
#' \code{classify_design_problem} determines the difficulty of a primer design 
#' task by estimating the distribution of coverage ratios per primer
#' by performing exact string matching with 
#' primers of length \code{primer.length}, which are constructed
#' by extracting template subsequences. Next, a beta distribution
#' is fitted to the estimated coverage distribution, which is
#' then compare to reference distributions representing
#' primer design problems of different difficulties via the
#' total variance distance. The difficulty of the input primer design
#' problem is found by selecting the class of the 
#' reference distributions that has the smallest distance
#' to the estimated coverage distribution.
#' An estimate of the required number of primers to reach a given
#' \code{required.cvg} can be computed by setting
#' \code{primer.estimate} to \code{TRUE}. Since this estimate
#' is based solely on perfect matching primers, the number of
#' primers that would actually be required is typically less.
#'
#' @return \code{classify_design_problem} returns a list with the 
#' following fields:
#' \describe{
#' \item{\code{Classification}}{The estimated difficulty of the primer design task.}
#' \item{\code{Class-Distances}}{The total variance distance of the fitted
#' beta distribution to the reference distribution.}
#' \item{\code{Confidence}}{The confidence in the estimate of the
#' design tasks' difficulty as based on the class distances.}
#' \item{\code{Uncertain}}{Whether the classification is highly uncertain, that is
#' low-confidence.}
#' \item{\code{Nbr_primers_fw} and \code{Nbr_primers_rev}}{The respective number of 
#' required forward and reverse primers if \code{primer.estimate} was set to \code{TRUE}.}
#' }
#' @export
#' @examples
#'
#' # Classify the difficulty of a primer design task
#' data(Ippolito)
#' design.estimate <- classify_design_problem(template.df[1:30,])
#' # Estimate the number of required primers to amplify the first 5 templates
#' design.estimate.nbr <- classify_design_problem(template.df[1:5,], mode.directionality = "fw",
#'                          primer.length = 20, primer.estimate = TRUE)
classify_design_problem <- function(template.df, 
                                    mode.directionality = c("both", "fw", "rev"),
                                    primer.length = 18, 
                                    primer.estimate = FALSE,
                                    required.cvg = 1) {
    if (!is(template.df, "Templates")) {
        stop("Please supply a valid template data frame")
    }
    mode.directionality <- match.arg(mode.directionality)
    # 1. Estimate lower bound of possible primer coverage ratios
    cvg.df <- estimate.cvg(template.df, k = primer.length, mode.directionality)
    # 2. Fit a Beta distribution to the coverage ratios
    x <- c(cvg.df$fw$Coverage_Ratio, cvg.df$rev$Coverage_Ratio)
    fit.beta <- try(fitdistrplus::fitdist(x, "beta")) # beta has high error
    if (is(fit.beta, "try-error")) {
        # if there's too few unique x values, fitdistrplus won't be able to find a fit -> stop here!
        my.warning("ProblemEstimationProblem", "Could not estimate problem difficulty, problably because the estimated coverage distribution was too narrow.")
        return(NULL)
    }
    fit <- distr::Beta(fit.beta$estimate[1], fit.beta$estimate[2])
    #print("fit is: ")
    #print(fit.beta)
    #hist(rbeta(10000, shape1 = fit.beta$estimate[1], shape2 = fit.beta$estimate[2]))
    # 3. Compare the beta distribution to the reference distributions
    dists <- unlist(lapply(cvg.ref.dists, function(x) {
                    distrEx::TotalVarDist(x, fit)
    }))
    # 4. Classify
    best.idx <- which.min(dists)
    names(dists) <- names(cvg.ref.dists)
    classification <- names(cvg.ref.dists)[best.idx]
    # 5. Confidence of classification
    if (classification %in% c("very_easy", "easy")) {
        check.cols <- c("hard", "very_hard")
    } else if (classification %in% c("very_hard", "hard")) {
        check.cols <- c("easy", "very_easy")
    } else {
       check.cols <- c("easy", "very_easy", "hard", "very_hard") 
    }
    other.dist <- mean(dists[names(dists) %in% check.cols]) # mean of completely different distributions
    selected.dist <- dists[best.idx] # lowest distance
    confidence <- 1 - (selected.dist / other.dist)
    # refuse to give a classification if the standard error was too high, we're too uncertain of the real distribution in this case
    std.cutoff <- 10
    uncertain.class <- FALSE
    if (any(fit.beta$sd > std.cutoff)) {
        warning("Fit of distribution had a maximal standard error of: ",
                max(fit.beta$sd), ". Classification may be highly uncertain.")
        uncertain.class <- TRUE
    }
    nbr.fw.primers <- NA
    nbr.rev.primers <- NA
    if (primer.estimate) {
        # Identify the required nbr of primers
        primers.fw <- NULL
        primers.rev <- NULL
        if (mode.directionality == "fw") {
            primers.fw <- greedy.primers(cvg.df$fw, template.df, required.cvg)
        } else if (mode.directionality == "rev") {
            primers.rev <- greedy.primers(cvg.df$rev, template.df, required.cvg)
        } else {
            primers.fw <- greedy.primers(cvg.df$fw, template.df, required.cvg)
            # adjust rev requirement by forward cvg -> should capture the same templates!
            cvg.idx <- unique(unlist(covered.seqs.to.idx(primers.fw$Covered_Seqs, template.df)))
            required.nbr <- required.cvg * nrow(template.df)
            new.required.cvg <- min(length(cvg.idx) / required.nbr, 1)
            new.template.df <- template.df[cvg.idx[order(cvg.idx)],]
            primers.rev <- greedy.primers(cvg.df$rev, new.template.df, new.required.cvg)
        }
        if (length(primers.fw) != 0) {
            nbr.fw.primers <- nrow(primers.fw)
        } 
        if (length(primers.rev) != 0) {
            nbr.rev.primers <- nrow(primers.rev)
        } 
    }
    # Prepare output:
    result <- list("Classification" = classification,
                   "Class-Distances" = dists,
                   "Confidence" = confidence,
                   "Uncertain" = uncertain.class,
                   "Nbr_primers_fw" = nbr.fw.primers,
                   "Nbr_primers_rev" = nbr.rev.primers)
    return(result)
}
greedy.primers <- function(binding.df, template.df, required.cvg = 1) {
    # lower bound on the number of required primers
    o <- order(binding.df$Coverage_Ratio, decreasing = TRUE)
    binding.df <- binding.df[o,]
    selected.primers <- NULL
    cur.cvg <- 0
    while ((cur.cvg / nrow(template.df)) < required.cvg && nrow(binding.df) > 0) {
        best.covered <- as.numeric(unlist(strsplit(binding.df[1, "Covered_Seqs"], split = ",")))
        if (length(best.covered) == 0) {
            break
        }
        selected.primers <- my_rbind(selected.primers, binding.df[1,])
        marginal.cvg.gain <- length(best.covered)
        cur.cvg <- cur.cvg + marginal.cvg.gain
        binding.df <- binding.df[-1, ]  # remove the selected primer and selection candidates that were filtered this round
        binding.df <- evaluate.diff.primer.cvg(binding.df, best.covered, template.df)  
    }
    return(selected.primers)
}
#' Validates a Primers Object.
#'
#' Checks whether a Primers object is valid or not.
#'
#' @param object An input data frame to be checked for being a primer data frame.
#' @return \code{TRUE}, if the object is valid, FALSE otherwise.
#' @keywords internal
validate_primers <-  function(object) {
    # specify minimal set of columns that should be present in a primer data frame:
    required.fields <- list("Identifier" = c("factor"), "ID" = "factor", 
                   "Forward" = "character", "Reverse" = "character",
                   "primer_length_fw" = c("integer", "numeric"), 
                   "primer_length_rev" = c("integer", "numeric"), 
                   "Direction" = "character",
                   "Run" = "character")
    possible.fields <- NULL # don't check for additional fields here
    if (!is(object, "data.frame")) {
        return("Input was no data frame.")
    }
    check.fields <- check_setting(possible.fields, object, required.fields)
    if (check.fields) {
        # Check that "Run" is unique
        if (length(unique(object$Run)) <= 1) {
            return(TRUE)
        } else {
            msg <- paste0("The 'Run' column may only contain a single unique value. Observed unique values were: ", paste0(unique(object$Run), collapse = ","))
            return(msg)
        }
    } else {
        return(check.fields)
    }
}

#' @rdname PrimerEval
#' @name PrimerEval
#' @aliases check_restriction_sites
#' @return \code{check_restriction_sites} returns a data frame 
#' with possible restriction sites found in the primers.
#' @references
#' Roberts, R.J., Vincze, T., Posfai, J., Macelis, D. (2010) REBASE–a database for DNA restriction
#' and modification: enzymes, genes and genomes. Nucl. Acids Res. 38: D234-D236. http://rebase.neb.com
#' @export
#' @examples
#' 
#' data(Ippolito)
#' # Check the first primer for restriction sites with respect to the first 10 templates
#' site.df <- check_restriction_sites(primer.df[1,], template.df[1:10])
check_restriction_sites <- function(primer.df, template.df, 
                            adapter.action = c("warn", "rm"), 
                            selected = NULL, only.confident.calls = TRUE,
                            updateProgress = NULL) {
    if (length(template.df) == 0 || nrow(template.df) == 0 || 
        !is(template.df, "Templates")) {
        stop("Need correct templates to identify restriction sites.") 
    }
    if (length(primer.df) == 0 || nrow(primer.df) == 0 || 
        !is(primer.df, "Primers")) {
            stop("Need valid primers.")
    }
    adapter.action <- match.arg(adapter.action)
    fw.idx <- which(nchar(primer.df$Forward) != 0)
    rev.idx <- which(nchar(primer.df$Reverse) != 0)
    fw.sites <- NULL
    rev.sites <- NULL
    if (length(fw.idx) != 0) {
        seqs <- DNAStringSet(primer.df$Forward[fw.idx])
        names(seqs) <- primer.df$ID[fw.idx]
        primer.seqs <- DNAStringSet(seqs)
        template.seqs <- template.df$Sequence
        names(template.seqs) <- template.df$Identifier
        template.seqs <- DNAStringSet(template.seqs)
        fw.sites <- check_restriction_sites_single(primer.seqs, template.seqs, 
                    adapter.action, 
                    direction = "fw",
                    only.confident.calls = only.confident.calls,
                    updateProgress = updateProgress)
    }
    if (length(rev.idx) != 0) {
        seqs <- primer.df$Reverse[rev.idx]
        names(seqs) <- primer.df$ID[rev.idx]
        primer.seqs <- DNAStringSet(seqs)
        template.seqs <- reverseComplement(template.seqs)
        rev.sites <- check_restriction_sites_single(primer.seqs, template.seqs, 
                    adapter.action, 
                    direction = "rev", 
                    only.confident.calls = only.confident.calls,
                    updateProgress = updateProgress)
    }
    site.df <- rbind(fw.sites, rev.sites)
    return(site.df)
}
#' Update of Primer Binding Regions.
#'
#' Updates the relative primer binding sites in the templates
#' when the template binding regions have changed since the last
#' coverage computation.
#'
#' @param primer.df A \code{Primers} data frame.
#' @param template.df Templates with the new binding regions.
#' @param old.template.df Templates with the old binding regions.
#' @return A \code{Primers} object with updated relative binding positions.
#' @keywords internal
update_primer_binding_regions <- function(primer.df, template.df, old.template.df) {
    if (length(primer.df) == 0 || nrow(primer.df) == 0) {
        return(primer.df)
    }
    if (!is(primer.df, "Primers")) {
        stop("Please input a 'Primers' object.")
    }
    if (!"primer_coverage" %in% colnames(primer.df)) {
        warning("Cannot update primer binding regions as no coverage was available.")
        return(primer.df)
    }
    if (!is(template.df, "Templates") || !is(old.template.df, "Templates")) {
        stop("Need to have 'Templates' objects to update the primer regions.")
    }
    # check that new region annotations are for the same set of templates
    if (nrow(template.df) != nrow(old.template.df) || template.df$ID != old.template.df$ID) {
        # template sets do not match -> don't adjust anything
        return(primer.df)
    }
     # update the current primers object in rv_primers
    fw.region <- list(template.df$Allowed_Start_fw, template.df$Allowed_End_fw)
    rev.region <- list(template.df$Allowed_Start_rev, template.df$Allowed_End_rev)
    fw.region.old <- list(old.template.df$Allowed_Start_fw, old.template.df$Allowed_End_fw)
    rev.region.old <- list(old.template.df$Allowed_Start_rev, old.template.df$Allowed_End_rev)
    # update the relative binding regions of the primers according to the changes from new to old regions
    # for fw: 'end' defines the relatie position of interest
    fw.diff <- fw.region[[2]] - fw.region.old[[2]]
    # for reverse: start defines the position of relative interest
    rev.diff <- rev.region[[1]] - rev.region.old[[1]]
    dirs <- c("_fw", "_rev")
    # cols with relation to forward binding region:
    relative.cols.fw <- unlist(lapply(dirs, function(x) paste0("Relative_Forward_Binding_Position_", c("Start", "End"), x)))
    # this won't work if we have paired primers -> need to know covered_seqs_fw and covered_seqs_rev in this case
    template.ids <- lapply(strsplit(primer.df$Covered_Seqs, split = ","), as.numeric)
    # assume template order hasn't changed:
    template.idx <- lapply(template.ids, function(x) match(x, template.df$Identifier))
    for (i in seq_along(relative.cols.fw)) {
        col <- relative.cols.fw[i]
        vals <- lapply(strsplit(primer.df[, col], split = ","), as.numeric)
        len.ok <- unlist(lapply(seq_along(vals), function(x) length(vals[[x]]) == length(template.idx[[x]])))
        if (any(!len.ok) && any(primer.df[,col] != "")) {
            warning("Can't adjust binding range for paired primers at the moment.")
            return(primer.df)
        }
        # for adjustment, need to get the template-specific difference
        adj.vals <- unlist(lapply(seq_along(vals), function(x) paste(vals[[x]] - fw.diff[template.idx[[x]]], collapse = ",")))
        primer.df[, col] <- adj.vals
    }
    relative.cols.rev <- unlist(lapply(dirs, function(x) paste0("Relative_Reverse_Binding_Position_", c("Start", "End"), x)))
     for (i in seq_along(relative.cols.rev)) {
        col <- relative.cols.rev[i]
        vals <- lapply(strsplit(primer.df[, col], split = ","), as.numeric)
        len.ok <- unlist(lapply(seq_along(vals), function(x) length(vals[[x]]) == length(template.idx[[x]])))
        if (any(!len.ok) && any(primer.df[,col] != "")) {
            warning("Can't adjust binding range for paired primers at the moment.")
            return(primer.df)
        }
        # for adjustment, need to get the template-specific difference
        adj.vals <- unlist(lapply(seq_along(vals), function(x) paste(vals[[x]] + rev.diff[template.idx[[x]]], collapse = ",")))
        primer.df[, col] <- adj.vals
    }
    return(primer.df)
}
#' Identification of Restriction Sites.
#'
#' Identifies restriction sites in a list with putative
#' restriction sites provided by \code{bad.regions} using
#' a data frame of restriction sites given by \code{DB}.

#' @param bad.region IRanges with possible adapter sites.
#' @param DB A data frame with restriction enzyme sites.
#' @return A boolean data frame indicating the presence
#' of adapters for all considered restriction sites.
#' @keywords internal
restriction_hits <- function(bad.regions, DB) {
    bad.seqs <- unlist(lapply(bad.regions, function(x) as.character(unlist(x))))
    names(bad.seqs) <- unlist(lapply(seq_along(bad.regions), function(x) rep(names(bad.regions)[x], length(unlist(bad.regions[[x]])))))
    i <- NULL
    hit.db <- foreach (i = seq_len(nrow(DB)), .combine = cbind) %dopar% {
        enzyme <- as.character(DB$nam[i])
        temp <- vmatchPattern(DB$rep[i], bad.seqs, 
                max.mismatch = 0, with.indels = FALSE)
        hits <- sapply(temp, function(x) length(x) != 0)
        if (length(hits) == 0) {
            NULL
        } else {
            hit.idx <- which(hits)
            result <- data.frame("Hit" = hits, stringsAsFactors = FALSE) 
            colnames(result)[colnames(result) == "Hit"] <- enzyme
            result
        }
    }
    if (length(hit.db) != 0) {
        attr(hit.db, "ID") <- names(bad.seqs)
    }
    return(hit.db)
}

#' Identification of Badly Fitting Regions.
#'
#' Identify regions in the templates where the primers are
#' not very complementary. These regions indicate possible
#' restriction enzyme adapters.
#'
#' @param primer.seqs Primer sequences.
#' @param template.seqs Template sequences.
#' @param search.hits Template substrings that agree well
#' with the input primers.
#' @return A list with putative restriction sites for every primer.
#' @keywords internal
restriction_ali <- function(primer.seqs, template.seqs, search.hits) {
    bad.regions <- vector("list", length(search.hits))
    # define substitution matrix for alignment
    mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = 0, 
                                                    baseOnly = FALSE, type = "DNA")
    for (i in seq_along(search.hits)) {
        # align primer with hit region in the templates
        if (is.na(search.hits[i])) {
            next
        }
        ali <- pairwiseAlignment(primer.seqs[i], search.hits[i],
                    substitutionMatrix = mat,
                     gapOpening = -3, gapExtension = -1, type = "global-local") # open up gaps directly
        # identify indels and non-matching regions
        # minimal number of bases of restriction 
        restrict.size <- 3
        if (length(ali@subject@indel[[1]]) != 0) {
            # there were indels -> look at the indel regions
            indel <- ali@subject@indel[[1]]
            # only select indels larger than cutoff
            sel <- indel@width >= restrict.size
            indel <- indel[sel]
            if (length(indel) != 0) {
                site <- extractAt(primer.seqs[i], indel)
                bad.regions[[i]] <- unname(site)
            }
        } else {
            # no indels -> look at the longest mismatch region
            mm.region <- unlist(mismatch(ali@pattern))
            # discard short mismatch regions:
            if (length(mm.region) > restrict.size) { 
                # more than 3 mismatches -> consider as adapter seq
                site <- extractAt(primer.seqs[i], 
                        IRanges(min(mm.region), max(mm.region)))
                bad.regions[[i]] <- unname(site)
            }
        }
    }
    names(bad.regions) <- names(primer.seqs)
    return(bad.regions)
}
#' Identification of Sequence Matches.
#'
#' Determines the most similar template sequence for every
#' input primer sequence. Used to identify regions for
#' alignment for the identification of restriction sites.
#'
#' @param primer.seqs Primer sequences.
#' @param template.seqs Template sequences.
#' @return A vector with the template regions matching the 
#' \code{primer.seqs} best.
#' @keywords internal
restriction_match <- function(primer.seqs, template.seqs) {
    search.hits <- rep(NA, length(primer.seqs))
    for (i in seq_along(primer.seqs)) {
        scores <- rep(Inf, length(template.seqs))
        best.ali <- NA
        for (j in seq_len(length(template.seqs))) { # n^2 * n^2 = n^4 performance ...
            # vmatchpattern doesn't support indels ...
            # max.mismatch: gives the detection limit for adapter sequences/overhang extent
            d <- matchPattern(as.character(primer.seqs[i]), 
                    as.character(template.seqs[j]), 
                    with.indels = TRUE, 
                    max.mismatch = 12)
            if (length(d@ranges) == 0) {
                # no hits
                next
            }
            # retrieve hits in the templates
            hits <- sapply(d, function(x) as.character(x))
            # determine best hit by computing distance between region of interest and pattern
            d <- stringdist::stringdist(hits, as.character(primer.seqs[i]))
            # select the hit with the smallest edit distance
            sel <- which.min(d)
            scores[j] <- d[sel]
            hit <- hits[sel]
            if (scores[j] <= min(scores)) {
                best.ali <- hit
            }
        }
        search.hits[i] <- best.ali
    }
    return(search.hits)
}
#' Identification of Sequence Restriction Sites.
#'
#' Checks the input sequences \code{seqs} for the presence of
#' restriction sites. By removing the restriction sites from a primer set, 
#' it is possible to identify the coverage of the primers
#' (e.g. using \code{\link{check_constraints}}) discounting for 
#' the impact of the mismatching bases caused by the insert.
#'
#' @param primer.seqs Nucleotide sequences of primers to be checked for restriction sites in terms of a \code{DNAStringSet} object.
#' @param template.seqs A \code{DNAStringSet} object with nucleotide sequences 
#' containing the templates corresponding to \code{seqs}.
#' @param adapter.action The action to be performed when adapter sequences
#' are found. Either "warn" to issue warning about adapter sequences or
#' "rm" to remove identified adapter sequences.
#' @param The primer direction that is checked.
#' @param selected Names of restriction sites that are to be checked.
#' By default \code{selected} is \code{NULL} in which case all REBASE 
#' restriction sites are checked.
#' @param only.confident.calls Only output confident calls of restriction sites.
#' @param updateProgress A Shiny progress callback function.
#' @return A data frame with restriction sites, if any could be found.
#' @references
#' Roberts, R.J., Vincze, T., Posfai, J., Macelis, D. (2010) REBASE–a database for DNA restriction
#' and modification: enzymes, genes and genomes. Nucl. Acids Res. 38: D234-D236. http://rebase.neb.com
#' @keywords internal
check_restriction_sites_single <- function(primer.seqs, template.seqs, adapter.action, 
                            direction = c("fw", "rev"),
                            selected = NULL, 
                            only.confident.calls = TRUE,
                            
                            updateProgress = NULL) {

    if (length(template.seqs) == 0 || length(primer.seqs) == 0) {
        return(NULL)
    }
    if (!is(primer.seqs, "DNAStringSet")) {
        stop("Primers should be a DNAStringSet.")
    }
    if (!is(template.seqs, "DNAStringSet")) {
        stop("Primers should be a DNAStringSet.")
    }
    if (length(direction) == 0) {
        stop("Please provide the 'direction' argument.")
    }
    direction <- match.arg(direction)
    #############
    # n.b.: this approach only works if the primer makes sense at all
    # if the primer isn't really complementary to the templates, 
    # we also won't be able to find restriction sites.
    # sysdata is available within the package automatically: enzdata
    #############
    # use only restriction sites without degeneracies for simplicity..
    counts <- sapply(regmatches(enzdata$site, gregexpr("[a-z]", enzdata$site, perl=TRUE)), length)
    sel <- which(counts == 0)
    DB <- enzdata[sel,]
    if (!is.null(selected)) {
        m <- match(selected, DB$nam)
        if (any(is.na(m))) {
            msg <- paste("Selected enzyme not available: ",
                paste(selected[is.na(m)], collapse = ", "), sep = "")
            stop(msg)
        }
        DB <- DB[m,]
    }
    if (is.function(updateProgress)) {
        detail <- "String matching"
        updateProgress(1/3, detail, "inc")
    }
    ##########
    # Confident calls: check for mismatches in the templates
    ##########
    # 1. Identify matches of primers in templates
    search.hits <- restriction_match(primer.seqs, template.seqs)
    if (is.function(updateProgress)) {
        detail <- "Aligning"
        updateProgress(1/3, detail, "inc")
    }
    # 2. Find the non-matching regions in the templates
    bad.regions <- restriction_ali(primer.seqs, template.seqs, search.hits)
    #####
    # Non-confident call: input primers
    #######
    input.regions <- vector("list", length(primer.seqs))
    names(input.regions) <- names(bad.regions)
    for (i in seq_along(primer.seqs)) {
        input.regions[[i]] <- DNAStringSetList(as.character(primer.seqs[i]))
    }
    all.regions <- list("Confident" = bad.regions, "Unconfident" = input.regions)
    if (is.function(updateProgress)) {
        detail <- "Identifying adapters"
        updateProgress(1/3, detail, "inc")
    }
    # 3. Identify whether the non-matching regions are restriction sites
    all.hits <- vector("list", length(all.regions))
    for (j in seq_along(all.regions)) {
        bad.regions <- all.regions[[j]]
        hit.db <- restriction_hits(bad.regions, DB) 
        if (is.null(hit.db)) {
            next
        }
        # 4. from all hits, only consider the common restriction sites
        common.DB <- DB[DB$common,]
        idx <- lapply(seq_len(nrow(hit.db)), function(x) which(as.logical(hit.db[x,])))
        hit.names <- lapply(idx, function(x) colnames(hit.db)[x])
        found.DB.idx <- lapply(seq_along(hit.names), function(x) idx[[x]][which(hit.names[[x]] %in% common.DB$nam)])
        no.hit.idx <- intersect(which(sapply(found.DB.idx, length) == 0), which(sapply(idx, length) != 0))
        if (length(no.hit.idx) != 0) {
            # no hit in the common restriction sites -> check for the most specific rare restriction site
            found.DB.idx[no.hit.idx] <- lapply(seq_along(no.hit.idx), function(x) idx[[no.hit.idx[x]]][which.max(nchar(DB$rep[idx[[no.hit.idx[x]]]]))])
        }
        if (length(found.DB.idx) == 0) {
            # still no hits? we're done
            next
        }
        names(found.DB.idx) <- attr(hit.db, "ID")
        hit.out <- vector("list", length(found.DB.idx))
        for (i in seq_along(found.DB.idx)) {
            if (length(found.DB.idx[[i]]) == 0) {
                # no hits found
                next
            }
            p.id <- names(found.DB.idx)[i]
            p.idx <- match(p.id, names(primer.seqs@ranges))
            out <- data.frame("Primer_Identifier" = p.id,
                              "Direction" = direction,
                              "Sequence" = unname(tolower(as.character(primer.seqs[p.idx]))),
                        "Enzyme" = as.character(DB$nam)[found.DB.idx[[i]]],
                        "Site" = tolower(DB$rep[found.DB.idx[[i]]]),
                        stringsAsFactors = FALSE)
            hit.out[[i]] <- out
        }
        hit.out <- do.call(rbind, hit.out)
        if (length(hit.out) != 0) {
            hit.out$Confidence <- names(all.regions)[j]
        }
        all.hits[[j]] <- hit.out
    }
    hit.out <- do.call(rbind, all.hits)
    # only return unique hits per primer
    hit.out <- ddply(hit.out, c("Primer_Identifier", "Sequence", "Enzyme", "Site"),
            function(x) arrange(x, substitute(Confidence))[1, ])
    if (only.confident.calls) {
        hit.out <- hit.out[hit.out$Confidence == "Confident",]
    }
    # warn about restriction sites:
    if (adapter.action == "warn") {
        # warn about restriction sites
        enzymes <- ddply(hit.out, c("Enzyme", "Site"), summarize, 
                    Identifiers = paste(substitute(Primer_Identifier), collapse = ","))
        if (nrow(enzymes) != 0) {
            out <- sapply(seq_len(nrow(enzymes)), function(x) 
                    paste("Found ", enzymes[x,"Enzyme"], " (", enzymes[x,"Site"], 
                    ") adapter in primers ", enzymes[x,"Identifiers"], 
                    ". Please check your sequences.", sep = ""))
            warning(paste(out, collapse = "\n"))
        }
    } else if (adapter.action == "rm") {
        # remove restriction sites from primers
        warning("Removal is not implemented yet.")
    }
    return(hit.out)
}
#' @rdname Input
#' @name Input
#' @aliases Primers-class
#' @section Basic columns:
#' In the following you can find a description of the most
#' important columns that can be found in objects of class \code{Primers}. 
#' Note that angular brackets indicate the existence of multiple possibilities.
#' The following columns are present when a set of primers
#' is loaded from a FASTA file using \code{\link{read_primers}}:
#' \describe{
#' \item{\code{ID}}{The identifiers of the primers.}
#' \item{\code{Identifier}}{The internal identifiers of the primers.}
#' \item{\code{Forward}}{The sequences of forward primers.}
#' \item{\code{Reverse}}{The sequences of reverse primers.}
#' \item{\code{primer_length<fw|rev>}}{The lengths of
#' forward and reverse primer sequences, respectively.}
#' \item{\code{Direction}}{Either 'fw' for forward primers,
#' 'rev' for reverse primers, or 'both' for a primer pair.}
#' \item{\code{Degeneracy_<fw|rev>}}{The degeneracy (ambiguity) of
#' forward and reverse primers, respectively.}
#' \item{\code{Run}}{An identifier describing the primer set.}
#' }
#'
#' @section Coverage-related columns:
#' The following columns are only available in an object of
#' class \code{Primers} after primer coverage
#' has been computed, that is after \code{\link{check_constraints}}
#' has been called with the active \code{primer_coverage} constraint. Computed coverage
#' values relating solely to string matching are indicated by the prefix
#' \code{Basic_}, while columns without this prefix relate to the coverage after
#' applying the constraints formulated via \code{CoverageConstraints}.
#' Information on off-target coverage events are indicated by
#' the \code{Off_} prefix, while on-target coverage events do not carry
#' this prefix.
#' 
#' \describe{
#' \item{\code{primer_coverage}}{The number of templates that are
#' covered by the primers. Note that if a primer set contains
#' primers of both directions, a template is only considered covered
#' if it is covered by primers of both directions.}
#' \item{\code{Coverage_Ratio}}{The ratio of templates that are covered by the primers.}
#' \item{\code{Binding_Position_Start_<fw|rev>}}{The upstream position in the 
#' templates where forward and reverse primers respectively bind.}
#' \item{\code{Binding_Position_End_<fw|rev>}}{The downstream position in the templates where forward and reverse primers respectively bind.}
#' \item{\code{Relative_<Forward|Reverse>_Binding_Position_<Start|End>_<fw|rev>}}{
#' The binding upstream (\code{Start}) or downstream (\code{End}) positions 
#' of the primers relative to the forward (\code{Forward})
#' or reverse (\code{Reverse}) binding regions, either for 
#' forward (\code{fw}) or reverse primers (\code{rev}).}
#' \item{\code{Binding_Region_Allowed}}{Whether a coverage event
#' occurred in the target binding region or not. If the allowed
#' off-target ratio was set to 0 only coverage events within the 
#' the target region are reported.}
#' \item{\code{Nbr_of_mismatches_<fw|rev>}}{The number of mismatches
#' of forward and reverse primer coverage events, respectively.}
#' \item{\code{Mismatch_pos_<fw|rev>}}{The position of mismatches
#' for forward and reverse coverage events, respectively. Mismatch
#' positions are reported relative to the 3' end, that is, position
#' 1 indicates a mismatch in the last base of a primer.}
#' \item{\code{primer_specificity}}{The specificity of a primer
#' as determined by its ratio of off-target binding events.}
#' }
#' 
#' @section Constraint-related columns:
#' Each constraint that is considered when calling \code{\link{check_constraints}}
#' gives rise to at least one column in the provided \code{Primers} object.
#' Due to the large number of possible constraints, we will limit our description
#' to the \code{gc_clamp} constraint. Once the GC clamp property has been computed,
#' the \code{gc_clamp_fw} column contains the length of the GC clamp for forward 
#' primers and \code{gc_clamp_rev} the corresponding length for reverse primers.
#' Whether the desired extent of the GC clamp was obtained by a primer
#' is indicated by the \code{EVAL_gc_clamp} column. It contains \code{TRUE} when
#' the GC clamp constraint was fulfilled and \code{FALSE} when it was broken.
#' To identify whether all required constraints were fulfilled by a primer,
#' the \code{constraints_passed} column can be used. It contains \code{TRUE}
#' if all \code{active.constraints} used by \code{\link{check_constraints}} were fulfilled
#' and \code{FALSE} otherwise.
#'
#' @return The \code{Primers} constructor returns an object of 
#' class \code{Primers}.
#' @exportClass Primers
#' @export
#' @examples
#' 
#' # Load a set of primers
#' primer.location <- system.file("extdata", "IMGT_data", "primers", "IGHV", 
#'                      "Ippolito2012.fasta", package = "openPrimeR")
#' primer.df <- read_primers(primer.location, "_fw", "_rev")
setClass("Primers",
   contains = c("data.frame"), validity = validate_primers)


setMethod("initialize", "Primers",
    function(.Object, ...) {
        # just call data frame constructor
        Object <- callNextMethod(.Object, ...)
    }
) 
#' @rdname Input
#' @name Input
#' @aliases Primers-class
#' @export
Primers <- function(...) new("Primers", ...) 

my_format_df <- function (x) {
    if (is.atomic(x) && !is.null(x)) {
        stop("Internal structure doesn't seem to be a list. Possibly corrupt data.frame.")
    }
    classes <- unlist(lapply(x, class))
    factor.idx <- which(classes == "factor")
    for (i in factor.idx) {
        x[, i] <- as.character(x[, i])
    }
    char.trunc <- function(x, trunc.char = 20) {
        trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE)
        if (!is.character(x) || trunc.char <= 0L) return(x)
        idx = which(nchar(x) > trunc.char)
        x[idx] = paste(substr(x[idx], 1L, as.integer(trunc.char)), "...", sep="")
        x
    }
    # truncate all strings
    classes <- unlist(lapply(x, class))
    string.idx <- which(classes == "character")
    for (i in string.idx) {
        x[, i] <- char.trunc(x[,i])
    }
    return(x)
}

my_show_df <- function(x, topn = 3,
                       nrows = 6) {
    # topn  - print the top topn and bottom topn rows with '---' inbetween 
    # nrows - if x is <= nrows, the whle table is printed
    if (!is.numeric(nrows)) nrows = 10
    if (!is.infinite(nrows)) nrows = as.integer(nrows)
    if (nrows <= 0L) return(invisible())   # ability to turn off printing
    if (!is.numeric(topn)) topn = 5
    topnmiss = missing(topn)
    topn = max(as.integer(topn),1L)
    if (nrow(x) == 0L) {
        if (length(x)==0L)
           cat(paste0("Empty object of class '", class(x), "'.\n"))
        else
           cat(paste0("Object of class '", class(x), "' without any rows.\n"))
        return(invisible())
    }
    if (topn*2<nrow(x) && (nrow(x)>nrows || !topnmiss)) {
        toprint = rbind(head(x, topn), tail(x, topn))
        rn = c(seq_len(topn), seq.int(to=nrow(x), length.out=topn))
        printdots = TRUE
    } else {
        toprint = x
        rn = seq_len(nrow(x))
        printdots = FALSE
    }
    toprint = my_format_df(x) # modify data frame to have only character classes 
    rownames(toprint) <- seq_len(nrow(toprint))
    if (is.null(names(x)) | all(names(x) == "")) colnames(toprint)=rep("", ncol(toprint))
    if (printdots) {
        toprint = rbind(head(toprint,topn),"---" = "",tail(toprint,topn))
        rownames(toprint) = format(rownames(toprint),justify="right")
        print(toprint,right=TRUE)
        return(invisible())
    }
    if (nrow(toprint)>20L)
        # repeat colnames at the bottom if more than 20 rows are printed
        toprint=rbind(toprint,matrix(colnames(toprint),nrow=1))
    print(toprint,right=TRUE)
    invisible()
}
setMethod("show", "Primers", function(object) {
    # overwrite the 'print' function in order to limit the output
    my_show_df(asS3(object))
})
setMethod("summary", "Primers", function(object) {
    stats <- primer.set.parameter.stats(object, get.analysis.mode(object), NULL)
    return(stats)
})

#' cbind for Primers class.
#'
#' Ensures that the cbind result has the appropriate class.
#'
#' @param ... Parameters for cbind function.
#' @return Column binded Primers data frame.
#' @keywords internal
#' @export
#' @examples
#' data(Ippolito)
#' primer.df <- cbind(primer.df, primer.df)
cbind.Primers <- function(...) {
    df <- cbind.data.frame(...)
    df <- Primers(df)
    return(df)
}
#' S4 cbind for Primers.
#'
#' S4 cbind function for Primers.
#'
#' @export
#' @rdname Primers-method
#' @param x The Primers data frame.
#' @param y Another data frame.
#' @return Cbinded primer data frame.
#' @keywords internal
#' @examples
#' data(Ippolito)
#' primer.df <- cbind2(primer.df, seq_len(nrow(primer.df)))
setMethod("cbind2", c(x = "Primers", y = "ANY"),
    # need this in case of S3 dispatch fails.
    function(x, y, ...) {
        df <- cbind.data.frame(x, y, ...) # packages should not call .Internal()?
        df <- Primers(df)
        return(df)
    }
)
#' S4 rbind for Primers.
#'
#' S4 rbind function for Primers.
#'
#' @export
#' @return Rbinded primer data frame.
#' @rdname Primers-method
#' @keywords internal
setMethod("rbind2", c(x = "Primers", y = "ANY"),
    # need this in case of S3 dispatch fails.
    function(x, y, ...) {
        df <- rbind.data.frame(x, y, ...)
        df <- Primers(df)
        return(df)
    }
)
#' rbind for Primers class.
#'
#' Ensures that the rbind result has the appropriate class.
#'
#' @param ... Parameters for rbind function.
#' @return Row-binded Primers data frame.
#' @keywords internal
#' @export
#' @examples
#' data(Ippolito)
#' primer.df <- rbind(primer.df, primer.df)
rbind.Primers <- function(...) {
    df <- rbind.data.frame(...)
    df <- Primers(df)
    return(df)
}

#' Slicing Operator for Primers.
#'
#' Slices a Primers data frame.
#'
#' @param x The Primers data frame.
#' @param i The row index.
#' @param j The column index.
#' @param ... Other arguments to the slice operator.
#' @param drop Simplify data frame?
#' @exportMethod [
#' @rdname Primers-method
#' @return Subset of primer data frame.
#' @keywords internal
#' @aliases [,Primers-method [,Primers,ANY-method
#' @examples
#' data(Ippolito)
#' primer.df <- primer.df[1:2,]
setMethod("[", c("Primers", "ANY", "ANY"),
    function(x, i, j, ..., drop = TRUE) {
        if (missing(drop)) {
            df <- asS3(x)[i, j, ...]
        } else {
            df <- asS3(x)[i, j, ..., drop = drop]
        }
        if (is(df, "data.frame")) {
            # only set my class if we haven't simplified.
            #options("show.error.messages" = FALSE)
            p.df <- suppressWarnings(try(Primers(df), silent = TRUE))
            #options("show.error.messages" = TRUE)
            if (is(p.df, "try-error")) {
                # removed crucial columns -> turn into data frame
                p.df <- asS3(df)
            } 
            df <- p.df
        }
        return(df)
    }
)
#' Dollar Operator for Primers.
#'
#' Stores data in a column of a Primers data frame.
#'
#' @exportMethod $<-
#' @rdname Primers-method
#' @param name The name of the column.
#' @param value The values of the column.
#' @return Primer data frame with replaced column.
#' @keywords internal
#' @examples
#' data(Ippolito)
#' primer.df$Forward[1] <- "ctagcgggaccg"
setMethod("$<-", "Primers", 
    function(x, name, value) {
        df <- asS3(x)
        eval(parse(text = paste("df$", name, " <- value", sep = "")))
        df <- Primers(df)
        return(df)
    }
)
#' Updates the Primer Coverage.
#'
#' Updates the most important columns in a primer data frame according to the selected
#' coverage definition. Only coverage events with less or equal than the allowed number of
#' mismatches according to the selected coverage definition will be retained.
#'
#' @param primer.df A \code{Primers} object.
#' @param template.df A \code{Templates} object.
#' @param allowed.mismatches A numeric giving the maximal number of allowed.mismatches.
#' @param cvg.definition The definition of coverage to be used, either "constrained" or "basic".
#' @return A primer data frame with modified coverage information.
#' @keywords internal
update_primer_cvg <- function(primer.df, template.df, allowed.mismatches, cvg.definition = c("constrained", "basic")) {
    # note: only updates primer_coverage, covered_seqs, not binding positions etc.
    # Consider only coverage events with <= allowed.mismatches and according to the provided cvg.definition
    if (length(primer.df) == 0 || nrow(primer.df) == 0) {
        # nothing to update ..
        return(primer.df)
    }
    mode.directionality <- get.analysis.mode(primer.df)
    cvg.definition <- match.arg(cvg.definition)
    mm.info <- prepare_mm_plot(primer.df, template.df)
    # select events according to selected cvg.definition and number of mismatches only
    df <- mm.info[mm.info$Coverage_Type == cvg.definition & mm.info$Number_of_mismatches <= allowed.mismatches, ]
    ##################
    # just move from mismatch rep to coverage rep
    ddf <-  ddply(df, c("Primer", "Direction", "Template", "Group"), summarize,
                            Position = unique(substitute(Position_3terminus)), 
                            Number_of_mismatches = unique(substitute(Number_of_mismatches)))
    #####
    if (mode.directionality == "both") {
        # remove individual binding events
        # for pairs of primers: presence of any other other direction is fine for cvg
        dir.count.both <- ddply(ddf, c("Template", "Group"), summarize, DirectionCount = length(unique(substitute(Direction))))
        rm.idx <- which(dir.count.both$DirectionCount <= 1)
        if (length(rm.idx) != 0) {
            m <- match(dir.count.both$Template[rm.idx], ddf$Template)
            ddf <- ddf[-m,]
        }
    }
    # for primers of both directions: they were replicated (fw+both) earlier, need to select events were pairs are present and then select the individual primer once again
    both.sel <- which(primer.df$Direction[match(ddf$Primer, primer.df$ID)] == "both")
    dff.both <- ddply(ddf[both.sel,], c("Primer", "Template"),
        summarize, 
        Group = unique(substitute(Group)))
    dff.single <- ddf[setdiff(seq_len(nrow(ddf)), both.sel), c("Primer", "Template", "Group")]
    dff <- rbind(dff.both, dff.single)
    ##########
    cvd <- ddply(dff, "Primer", summarize, Covered_Seqs = paste(substitute(Template), collapse = ","))
    m <- match(cvd$Primer, primer.df$ID)
    new.df <- data.frame(ID = primer.df$ID, Covered_Seqs = "", primer_coverage = 0,
                         stringsAsFactors = FALSE)
    if (length(m) != 0) { # we found coverage events for our selection
        # set corresponding entries in new.df to updated values
        cvd.identifier <- unlist(lapply(strsplit(cvd$Covered_Seqs, split = ","), function(x) paste(template.df$Identifier[match(x, template.df$ID)], collapse = ",")))
        new.df[m, 'Covered_Seqs'] <- cvd.identifier
        new.df[m, 'primer_coverage'] <- sapply(strsplit(cvd.identifier, split = ","), length)
    }
    # update primer data frame with new values
    new.primer.df <- primer.df
    new.primer.df[, colnames(new.df)] <- new.df
    return(new.primer.df)
}

#' @rdname Plots
#' @name Plots
#' @aliases plot_primer_binding_regions
#' @return \code{plot_primer_binding_regions} returns a plot of the primer binding regions.
#' @export
#' @include templates.R
#' @examples
#' 
#' # Primer binding regions of a single primer set
#' data(Ippolito)
#' p <- plot_primer_binding_regions(primer.df, template.df)
#' # Primer binding regions of multiple primer sets
#' data(Comparison)
#' p.comp <- plot_primer_binding_regions(primer.data[1:3], template.data[1:3])
setGeneric("plot_primer_binding_regions", 
    function(primers, templates, direction = c("both", "fw", "rev"), 
        group = NULL, relation = c("fw", "rev"), 
        region.names = c("Binding region", "Amplification region"), ...) {
        standardGeneric("plot_primer_binding_regions")
})
#' Plot of Primer Binding Regions for a Single Primer Set.
#'
#' Plots the primer binding regions in the templates.
#'
#' @param primers An object of class \code{Primers} with annotated
#' primer coverage.
#' @param templates An object of class \code{Templates} providing the
#' template sequences corresponding to \code{primers}. 
#' @param direction Primer direction
#' @param group The template groups for which binding events should
#' be determined. By default, \code{group} is set to \code{NULL} such that
#' all templates are considered.
#' @param relation A character vector specifying whether binding region data shall
#' be plotted relative to the forward (\code{fw}) or reverse (\code{rev}) 
#' target binding regions.
#' @param region.names Names for the primer binding region and the amplified region.
#' @return A histogram of primer binding regions.
#' @keywords internal
setMethod("plot_primer_binding_regions", 
    signature(primers = "Primers", templates = "Templates"),
    function(primers, templates,
        direction = c("both", "fw", "rev"), group = NULL, 
        relation = c("fw", "rev"), 
        region.names = c("Binding region", "Amplification region")) {
        
        relation <- match.arg(relation)
        direction <- match.arg(direction)
        if (length(primers) == 0 && nrow(primers) == 0) {
            return(NULL)
        }
        if (!is(primers, "Primers")) {
            stop("Please input a valid primers object.")
        }
        primer.data <- list(primers)
        names(primer.data) <- unique(primers$Run)
        p <- plot_primer_binding_regions(primer.data, list(templates),
                direction, group, relation, region.names = region.names)
        return(p)
})
create_region_boxes <- function(primers, templates, relation, region.names, ymin, ymax, xmax) {
    regions <- vector("list", length(templates)) # binding regions
    if (relation == "fw") {
        for (i in seq_along(templates)) {
            template.df <- templates[[i]]
            region.extents <- template.df$Allowed_End_fw_initial - template.df$Allowed_Start_fw_initial
            idx <- which.max(region.extents)
            s <- template.df$Allowed_Start_fw_initial[idx]
            e <- template.df$Allowed_End_fw_initial[idx]
            e.now <- template.df$Allowed_End_fw[idx]
            s.now <- template.df$Allowed_Start_fw[idx]
            delta <- e.now - e
            rel.binding.start <- -(e.now - s.now) - 1
            rect.x.leader <- -((e-s) + delta) - 1
            rect.x.leader.end <- rect.x.leader + e - s
            region.info.l <- data.frame("Region" = region.names[1], "xmin" = rect.x.leader, "xmax" = rect.x.leader.end, "ymin" = ymin, "ymax" = ymax)
            region.info.t <- data.frame("Region" = region.names[2], "xmin" = rect.x.leader.end + 1, "xmax" = xmax, "ymin" = ymin, "ymax" = ymax)
            region.df <- rbind(region.info.l, region.info.t)
            region.df$Run <- unique(primers[i]$Run)
            region.df$RelStartPosition <- rel.binding.start
            regions[[i]] <- region.df 
            #binding.starts[[i]] <- data.frame("Run" = names(primer.data)[i], "Position" = rel.binding.start)
        }
    } else {
        for (i in seq_along(templates)) {
            template.df <- templates[[i]]
            region.extents <- template.df$Allowed_End_rev_initial - template.df$Allowed_Start_rev_initial
            idx <- which.max(region.extents)
            seq.len <- nchar(template.df$Sequence[idx])
            s <- seq.len - template.df$Allowed_Start_rev_initial[idx] + 1
            e <- seq.len - template.df$Allowed_End_rev_initial[idx] + 1
            e.now <- seq.len - template.df$Allowed_End_rev[idx] + 1
            s.now <- seq.len - template.df$Allowed_Start_rev[idx] + 1
            # positive delta: binding region has to be shifted back (negative), negative delta: binding region has to be shifted forward (positive)
            delta <- s - s.now
            rel.binding.start <- e.now - s.now - 1
            ########
             rect.x.leader <- -((e-s) + delta) - 1
            rect.x.leader.end <- rect.x.leader + e - s
            #########
            rect.x.leader <- -(s - e + delta) - 1
            rect.x.leader.end <- rect.x.leader - e + s
            region.info.l <- data.frame("Region" = region.names[1], "xmin" = rect.x.leader, "xmax" = rect.x.leader.end, "ymin" = ymin, "ymax" = ymax)
            region.info.t <- data.frame("Region" = region.names[2], "xmin" = rect.x.leader.end + 1, "xmax" = xmax, "ymin" = ymin, "ymax" = ymax)
            region.df <- rbind(region.info.l, region.info.t)
            region.df$Run <- unique(primers[i]$Run)
            region.df$RelStartPosition = rel.binding.start
            regions[[i]] <- region.df 
            #binding.starts[[i]] <- data.frame("Run" = names(primer.data)[i], "RelStartPosition" = rel.binding.start)
        }
    }
    region.df <- do.call(rbind, regions)
    return(region.df)
}
#' Plot of Primer Binding Regions for Multiple Sets.
#'
#' Plots the primer binding regions for every primer set.
#'
#' @param primers List with primer data frames.
#' @param templates List with template data frames.
#' @param direction Direction of primers.
#' @param group Template groups to plot.
#' This defaults to plotting all groups.
#' @param relation Plot binding region relative to forward binding region or reverse?
#' @param region.names Names for the primer binding region and the amplified region.
#' @param highlight.set Primer sets to highlight in the plot.
#' @return A plot for primer binding region comparison.
#' @keywords internal
setMethod("plot_primer_binding_regions", 
    signature(primers = "list", templates = "list"),
    function(primers, templates,
        direction = c("both", "fw", "rev"), group = NULL, 
        relation = c("fw", "rev"), 
        region.names = c("Binding region", "Amplification region"), 
        highlight.set = NULL) {

    if (length(region.names) != 2) {
        stop("Need 2 region names.")
    }
    xlab <- "Location"
    direction <- match.arg(direction)
    relation <- match.arg(relation)
    xlab <- "Binding position"
    plot.data <- lapply(seq_along(primers), function(x) primer.binding.regions.data(primers[[x]], 
        templates[[x]], direction, group, relation))
    # annotate with run info
    runs <- get.run.names(primers)
    plot.data <- lapply(seq_along(plot.data), function(x) if (length(plot.data[[x]]) != 
        0 && nrow(plot.data[[x]]) != 0) {
        cbind(plot.data[[x]], Run = rep(primers[[x]]$Run[1], nrow(plot.data[[x]])))
    })
    plot.df <- do.call(rbind, plot.data)
    if (!is(plot.df, "data.frame")) {
        return(NULL)
    }
    if (length(highlight.set) != 0) {
        # check whether highlight set is specified correctly
        m <- match(highlight.set, plot.df$Run)
        na.idx <- which(is.na(m))
        if (length(na.idx) != 0) {
            msg <- paste("Highlight set not available in data:",
                paste(highlight.set[na.idx], collapse = ","))
            warning(msg)
            highlight.set <- highlight.set[!is.na(m)]
        }
    }
    # new annotation function for rectangle plot
    plot.df <- ddply(plot.df, c("ID", "Start", "End", "Run"), summarize, Count = length(substitute(ID)))
    # cumulate counts for multiple primer IDs -> ymax value for rectangles
    plot.df$ymax <- ave(plot.df$Count, plot.df$Start, plot.df$End, plot.df$Run, FUN = cumsum)
    # ymin: cumulative sum minus the current count
    plot.df$ymin <- plot.df$ymax - plot.df$Count
    colnames(plot.df)[colnames(plot.df) %in% c("Start", "End")] <- c("xmin", "xmax")
    #print(plot.df)
    levels <- unique(plot.df$Run)
    plot.df$Run <- factor(plot.df$Run, levels = levels[order(levels)])
    bwidth <- 10 # cover 10 positions with one bar
    ymin <- min(c(-1, -0.1 * max(plot.df$ymax)))
    ymax <- -0.2 # slightly negative to retain the border of the box
    # consider widest range of allowed regions from all template sets:
    ############################
    # set boundaries of the plot:
    xmin <- min(plot.df$xmin)
    xmax <- max(plot.df$xmax)
    ## require a minimal length for the region segments
    if (xmin > 50) {
        xmin <- -50
    }
    if (xmax < 50) {
        xmax <- 50
    }
    region.df <- create_region_boxes(primers, templates, relation, region.names, ymin, ymax, xmax)
    xmin <- min(xmin, min(region.df$xmin))
    x.ticks <- pretty(seq(xmin, xmax))
    x.labels <- x.ticks
    x.labels[x.labels > 0] <- paste0("+", x.labels[x.labels > 0])
    rect.colors <- c("#e5f4ff", "#ffefe5")
    names(rect.colors) <- region.names
    r.colors <- rep(rect.colors, length(levels(plot.df$Run)))
    plot.df$ID <- abbreviate(plot.df$ID, getOption("openPrimeR.plot_abbrev")) # shorten primer IDs
    pal <- getOption("openPrimeR.plot_colors")["Primer"] # the RColorBrewer palette to use
    primer.colors <- colorRampPalette(brewer.pal(8, pal))(length(unique(plot.df[, "ID"])))
    names(primer.colors) <- unique(plot.df[, "ID"])
    primer.colors <- c(rect.colors, primer.colors)
    p <- ggplot(plot.df) + ylab("Number of coverage events") + 
        xlab(xlab) + 
        ggtitle("Sites of primer binding in the templates")  +
        theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) +
         scale_x_continuous(limits = c(xmin, xmax),
                           breaks = x.ticks,
                           labels = x.labels) + 
        geom_vline(xintercept = -1, colour = "red") + # end of binding region
        geom_vline(data = region.df, aes_string(xintercept = "RelStartPosition"), colour = "red") + # start of binding region
        # rectangles for histogram of binding events:
        geom_rect(data = plot.df, 
                  mapping = aes_string(xmin = "xmin", xmax = "xmax",
                                      ymin = "ymin", ymax = "ymax",
                                      fill = "ID"),
                    linetype = "blank", alpha = 0.35) + # rectangles show some overlap due to the transparency!!
       scale_fill_manual(values = primer.colors, breaks = unique(plot.df$ID)) + 
        # x-axis rectangles to annotate binding/amplification region:
        geom_rect(data = region.df, 
            mapping = aes_string(xmin="xmin", xmax="xmax", 
                        ymin="ymin", ymax="ymax", fill = "Region"),
            alpha = 0.5,
            colour = "#3d3835", 
            size = 0.3, show.legend = FALSE)
    if (length(unique(plot.df$Run)) > 1) {
        # show facets and don't show individual primer legend
        p <- p + facet_wrap(~Run, ncol = 2) +
            guides(fill = FALSE)
    } else {
        # only show rectangle text for single plot
        p <- p + geom_text(data=region.df, 
            aes_string(x = "xmin+(xmax-xmin)/2", 
                       y = "ymin+(ymax-ymin)/2", 
                       label = "Region"), size = 4)

    }
    if (length(unique(plot.df$ID)) > 15) {
        # don't show legend for many primers
        p <- p + guides(fill = FALSE)
    }
    if (length(highlight.set) != 0) {
        # highlight selected sets
        # highlight of strip.text (facet part) individually doesn't work
        # -> this is the solution i came up with
        highlights <- data.frame(Run = highlight.set)
        p <- p + geom_rect(data=highlights,aes(xmin=-Inf, xmax=Inf, 
                    ymin=-Inf, ymax=Inf), fill='red', alpha=0.1)
    }
    return(p)
})

#' @rdname Input
#' @name Input
#' @aliases read_primers
#' @details
#' When loading primers via \code{read_primers}, the input arguments
#' \code{fw.id}, \code{rev.id}, \code{merge.ambig}, and \code{max.degen} 
#' are only used for loading primers from a FASTA file.
#' In this case, please ensure that \code{fw.id} and 
#' \code{rev.id} are set according to the keywords indicating
#' the primer directionalities in the FASTA file.
#' When loading primers from a CSV file, the format of the file should adhere
#' to the structure defined by the \code{\link{Primers}} class. 
#' 
#' @return \code{read_primers} returns a single object of class \code{Primers}
#' if a single input file is provided or a list of such objects if multiple
#' files are provided.
#' @export
#' @examples
#' 
#' primer.fasta <- system.file("extdata", "IMGT_data", "primers", "IGHV", 
#'                      "Ippolito2012.fasta", package = "openPrimeR")
#' primer.df <- read_primers(primer.fasta, "_fw", "_rev")
#' # Read multiple FASTA files
#' fasta.files <- list.files(system.file("extdata", "IMGT_data", "primers", 
#'                  "IGHV", package = "openPrimeR"), pattern = "*\\.fasta",
#'                  full.names = TRUE)[1:3]
#' primer.data <- read_primers(fasta.files)
#' # Read primers from a CSV file
#' primer.csv <- system.file("extdata", "IMGT_data", "comparison", 
#'              "primer_sets", "IGL", "IGL_openPrimeR2017.csv",  package = "openPrimeR")
#' primer.df <- read_primers(primer.csv)
#' # Read multiple primer CSV files
#' primer.files <- list.files(path = system.file("extdata", "IMGT_data", "comparison", 
#'                          "primer_sets", "IGH", package = "openPrimeR"),
#'                           pattern = "*\\.csv", full.names = TRUE)[1:3]
#' primer.data <- read_primers(primer.files)
#' # Read a mixture of FASTA/CSV files:
#' mixed.primers <- c(primer.fasta, primer.csv)
#' primer.data <- read_primers(mixed.primers)
read_primers <- function(fname, fw.id = "_fw", rev.id = "_rev", 
                    merge.ambig = c("none", "merge", "unmerge"), 
                    max.degen = 16, template.df = NULL, 
                    adapter.action = c("warn", "rm"), sample.name = NULL,
                    updateProgress = NULL) {

    adapter.action <- match.arg(adapter.action)
    if (is.function(updateProgress)) {
        detail <- "Reading primers"
        updateProgress(1/2, detail, "inc")
    }
    if (length(fname) > 1) {
        # load multiple primer FASTA/CSV files
        primers <- read_primers_multiple(fname, fw.id = fw.id, 
                    rev.id = rev.id, merge.ambig = merge.ambig, max.degen = max.degen, 
                    template.df = template.df,
                    adapter.action = adapter.action, sample.name = sample.name,
                    updateProgress = updateProgress) 
    } else {
        # load a single primer set from FASTA/CSV
        primers <- read_primers_single(fname, fw.id = fw.id, rev.id = rev.id,
                    merge.ambig = merge.ambig, max.degen = max.degen, 
                    template.df = template.df,
                    adapter.action = adapter.action, sample.name = sample.name,
                    updateProgress = updateProgress) 
    }
    return(primers)
}
read_primers_single <- function(primer.location, fw.id = "_fw", rev.id = "_rev", 
                    merge.ambig = c("none", "merge", "unmerge"), 
                    max.degen = 16, template.df = NULL, 
                    adapter.action = c("warn", "rm"), sample.name = NULL,
                    updateProgress = NULL) {

    if (!file.exists(primer.location)) {
        stop(paste("No file found at specified location: ",
                primer.location, sep = ""))
    }
    # load the primers and at the same time determine whether it's FASTA/CSV input
    primers <- try(my.read.fasta(primer.location, 
                tolower(names(IUPAC_CODE_MAP))), silent = TRUE)
    if (is(primers, "try-error")) {
        # let's try to read as CSV
        fasta.error <- attr(primers, "condition")
        primers <- try(read_primers_csv(primer.location), silent = TRUE)
        if (is(primers, "try-error")) {
            csv.error <- attr(primers, "condition")
            stop(paste("Could not read the file as FASTA or CSV.",
                "The FASTA error was: ",
                fasta.error, ".\n The CSV error was: ", csv.error))
       } else {
            # it's a CSV:
            ext <- "csv"
       }
    } else {
        # it's a FASTA:
        ext <- "fasta"
    }
    if (ext == "csv") {
        # nothing to change in the input
    } else if (ext == "fasta") {
        # sanitize the input
        if (length(primers) == 0) {
            return(NULL)
        }
        if (is.function(updateProgress)) {
            detail <- "Annotating primers"
            updateProgress(1/2, detail, "inc")
        }
        primer.seqs <- sapply(primers, function(x) paste(tolower(x), collapse = ""))
        headers <- sapply(primers, function(x) attr(x, "Annot"))
        if (length(sample.name) == 0) { 
            # use fasta name without file ending as sample name
            sample.name <- sub("^([^.]*).*", "\\1", basename(primer.location))
        }
        # create Primers object:
        primers <- read_primers.internal(primer.seqs, headers, fw.id, rev.id, 
                        merge.ambig, max.degen, sample.name)
    } else {
        stop("Unknown filetype.")
    }
    if (!is.null(template.df)) {
        # check for restriction sites if template.df was inputted
        sites <- check_restriction_sites(primers, template.df, 
                    adapter.action, updateProgress = updateProgress)
    }
    return(primers)
}
#' Input of Multiple Primer Sets.
#'
#' Reads multiple CSV files representing stored objects of class \code{Primers}.
#'
#' @param filenames The paths to multiple primer CSV/FASTA files.
#' @return A list containing objects of class \code{Primers}.
#' @keywords internal
read_primers_multiple <- function(filenames, fw.id, rev.id, merge.ambig,
                    max.degen, template.df, adapter.action, 
                    sample.name, updateProgress) {
    data <- lapply(filenames, function(x) read_primers_single(x, fw.id = fw.id, 
                    rev.id = rev.id, merge.ambig = merge.ambig,
                    max.degen = max.degen, template.df = template.df, 
                    adapter.action = adapter.action, sample.name = sample.name,
                    updateProgress = updateProgress))
    # annotate names of list/make 'Run' identifiers unique:
    data <- set.run.names(data, filenames)
    return(data)
}
set.run.names <- function(data, filenames = NULL) {
    # annotate names of list using set run identifiers
    if (length(data) == 0) {
        return(data)
    }
    if (length(filenames) != 0) {
        # filenames are available -> use stripped filename for empty files
        run.names <- sub("^([^.]*).*", "\\1", basename(filenames))
    } else {
        run.names <- rep("Unknown", length(data))
    }
    non.empty.idx <- which(unlist(lapply(data, nrow)) != 0)
    # use stored run identifier for non-empty files:
    run.names[non.empty.idx] <- sapply(data[non.empty.idx], function(x) x$Run[1])
    # ensure that run names are unique
    run.names <- uniqtag::uniqtag(run.names, k = Inf, make.unique)
    names(data) <- run.names
    # update run names in the loaded primer sets that aren't empty
    for (i in seq_along(non.empty.idx)) {
        data[[non.empty.idx[i]]]$Run <- run.names[non.empty.idx[i]]
    }
    return(data)
}
#' Read Primer CSV File.
#'
#' Reads a primer data frame stored in a CSV file.
#'
#' @param file The path to a csv file containing the primer data.
#' @return A \code{Primers} object, an instance of a data frame.
#' @keywords internal
read_primers_csv <- function(file) { 
    # turn some columns from integer to char if necessary 
    # (column with a single number will be interpreted as
    # numeric otherwise) ...
    if (!file.exists(file)) {
        stop(paste("Primer CSV file at '", file, "' could not be found."), sep = "")
    }
    # character columns:
    fix.cvg.cols.constrained <- c("Relative_Forward_Binding_Position_Start_fw", "Relative_Forward_Binding_Position_End_fw", 
        "Relative_Forward_Binding_Position_Start_rev", "Relative_Forward_Binding_Position_End_rev", 
        "Relative_Reverse_Binding_Position_Start_fw", "Relative_Reverse_Binding_Position_End_fw", 
        "Relative_Reverse_Binding_Position_Start_rev", "Relative_Reverse_Binding_Position_End_rev", 
        "Covered_Seqs", "Binding_Position_Start_rev", "Binding_Position_End_rev", 
        "Binding_Position_Start_fw", "Binding_Position_End_fw", "Binding_Region_Allowed", 
        "Nbr_of_mismatches_fw", "Nbr_of_mismatches_rev", "Binding_Region_Allowed_fw", 
        "Mismatch_pos_fw", "Mismatch_pos_rev", "Binding_Region_Allowed_rev")
    off.fix.cvg.cols.constrained <- paste0("Off_", fix.cvg.cols.constrained)
    fix.cvg.cols.constrained <- c(fix.cvg.cols.constrained, off.fix.cvg.cols.constrained)
    fix.cvg.cols.basic <- paste0("Basic_", fix.cvg.cols.constrained)
    fix.cvg.cols <- c(fix.cvg.cols.constrained, fix.cvg.cols.basic)
    fix.cols <- c(fix.cvg.cols,
                   # coverage constraints:
                   c("primer_efficiency", "T_EVAL_primer_efficiency", 
                     "terminal_mismatch_pos", "annealing_DeltaG",
                     "stop_codon", "substitution", "coverage_model",
                  "Forward", "Reverse", "constraints_passed_T", "Direction", "Run"))
    numeric.fix.cols <- c("primer_length_fw", "primer_length_rev",
                        "mean_primer_efficiency", "primer_specificity")
    factor.fix.cols <- c("ID", "Identifier")
    p <- try(read.csv(file, stringsAsFactors = FALSE, row.names = NULL), silent = TRUE)
    if (is(p, "try-error")) {
        msg <- paste0("The csv file: '", file, "' does not seem to represent a valid object of class 'Primers'",
                      " Please ensure the correct format of the file!")
        my.error("TemplateFormatIncorrect", msg)
    }
    for (i in seq_len(ncol(p))) {
        if (colnames(p)[i] %in% fix.cols) {
            p[, colnames(p)[i]] <- as.character(p[, colnames(p)[i]])
            na.idx <- which(is.na(p[, colnames(p)[i]]))
            if (length(na.idx) != 0) {
                # replace NA's in char cols with empty string:
                p[na.idx, colnames(p)[i]] <- ""
            }
        } else if (colnames(p)[i] %in% numeric.fix.cols) {
            p[, colnames(p)[i]] <- as.numeric(p[, colnames(p)[i]])
        }  else if (colnames(p)[i] %in% factor.fix.cols) {
            p[, colnames(p)[i]] <- factor(p[, colnames(p)[[i]]], 
                                    levels = p[, colnames(p)[[i]]])
        }
    }
    p <- try(Primers(p), silent = TRUE)
    if (is(p, "try-error")) {
        my.error("TemplateFormatIncorrect", 
            paste0("The loaded primer csv data from the file '",
                   file, "' did not represent a valid 'Primers' object.",
                   " Please check your input!"))
    }
    return(p)
}


#' Internal Function for Reading Primers
#'
#' Reads the given primer sequences into a data frame.
#'
#' @param primer.seqs Primer sequences.
#' @param headers Headers of the primer FASTA file.
#' @param fw.id Identifier of forward primers in the headers.
#' @param rev.id Identifier of reverse primers in the headers.
#' @param merge.ambig Should ambiguous primers be merged?
#' @param max.degen Maximum allowed degeneracy
#' @return A data frame with primer sequences.
#' @keywords internal
read_primers.internal <- function(primer.seqs, headers, fw.id, rev.id, merge.ambig = c("none", "merge", "unmerge"), 
    max.degen, sample.name) {
   
    merge.ambig <- match.arg(merge.ambig)
    mode <- "default"
    if (length(primer.seqs) == 0) {
        return(NULL)
    }
    if (fw.id == "" && rev.id == "") {
        # assume only fw primers are present
        fw.id <- "_fw"
        rev.id <- NA
        headers <- paste(headers, fw.id, sep = "")
        my.warning("NotifyPrimersMissingKeyword", "No keywords provided for primer directionalities. Primers were assumed to be sense primers.")
    } else if (fw.id == "" && rev.id != "") {
        # rev id is given -> assume complement of rev hits are forward primers
        my.warning("NotifyPrimersMissingKeyword", "A keyword was provided only for reverse primers. All primers without the reverse keyword were assumed to be forward primers.")
        mode <- "guess_forward"
        fw.id <- NA
    } else if (fw.id != "" && rev.id == "") {
        my.warning("NotifyPrimersMissingKeyword", "A keyword was provided only for forward primers. All primers without the forward keyword were assumed to be reverse primers.")
        mode <- "guess_reverse"
        rev.id <- NA
    }
    dirs <- rep(NA, length(headers))
    dir.fw.idx <- NULL
    dir.rev.idx <- NULL
    if (mode != "guess_forward") {
        dir.fw.idx <- grep(fw.id, headers, fixed = TRUE)
    } else {
        dir.fw.idx <- setdiff(seq_along(headers), grep(rev.id, headers, fixed = TRUE))
    }    
    if (mode != "guess_reverse") {
        dir.rev.idx <- grep(rev.id, headers, fixed = TRUE)
    } else {
        dir.rev.idx <- setdiff(seq_along(headers), grep(fw.id, headers, fixed = TRUE))
    }
    if (length(intersect(dir.fw.idx, dir.rev.idx) != 0)) {
        msg <- "Direction annotation is not distinct. Please use distinct keywords."
        my.error("NotifyPrimersNoDirection", msg)
    }
    dirs[dir.fw.idx] <- "Forward" # names of columns
    dirs[dir.rev.idx] <- "Reverse"
    use.idx <- seq_along(headers)
    if (any(is.na(dirs))) {
        warning.msg <- "Not all primer directions are known. Check your input primer fasta file for the correct direction keywords. Primers without known direction are ignored. The affected primers are:"
        p <- paste(headers[which(is.na(dirs))], collapse = ",")
        msg <- paste(warning.msg, p, sep = "")
        my.error("NotifyPrimersNoDirection", msg)
    }
    p.df <- data.frame(Header = headers, Direction = dirs, Sequence = primer.seqs, 
        Seq_Length = nchar(primer.seqs), stringsAsFactors = FALSE)[use.idx, ]
    rownames(p.df) <- NULL
    # try to match identifiers of primers
    h <- headers
    if (!is.na(fw.id)) {
        h <- sub(fw.id, "", headers[use.idx], fixed = TRUE)
    }
    if (!is.na(rev.id)) {
        h <- sub(rev.id, "", h, fixed = TRUE)
    }
    h <- sub(">", "", h)
    p.df$ID <- h
    # check that every sample has non-duplicate fw/rev primer
    unique.directions <- ddply(p.df, "ID", summarize, duplicate = any(duplicated(substitute(Direction))))
    if (any(unique.directions$duplicate)) {
        # duplicated directions found
        idx <- which(unique.directions$duplicate)
        warn <- paste("Found primers with duplicate directions! The following primers were affected:\n", 
            paste(unique.directions$ID[idx], collapse = "\n", sep = ""))
        my.error("PrimersDuplicateDirections", warn)
    }
    # n.b.: dcast changes variable order -> change back again
    ori.order <- p.df$ID
    p.df <- dcast(p.df, ID ~ Direction, value.var = "Sequence")  # wide-format :-)
    m <- match(p.df$ID, ori.order)
    p.df <- p.df[order(m),]
    # replace missing primers with ''
    p.df[is.na(p.df)] <- ""
    # if one direction is not present -> set it
    dir.mode <- "both"
    if (!"Forward" %in% colnames(p.df)) {
        dir.mode <- "rev"
        p.df$Forward <- ""
    }
    if (!"Reverse" %in% colnames(p.df)) {
        dir.mode <- "fw"
        p.df$Reverse <- ""
    }
    if (merge.ambig == "merge") {
        p.df <- merge.ambig.primers(p.df, dir.mode, max.degen)
        # message(p.df) disambiguate primers with ambiguities
    } else if (merge.ambig == "unmerge") {
        p.df <- disambiguate.primers(p.df)
    }
    # annotate direction of primers:
    directions <- ifelse(nchar(p.df$Forward) != 0 & nchar(p.df$Reverse) != 0, "both", 
                    ifelse(nchar(p.df$Forward) != 0, "fw", "rev"))
    p.df <- cbind(Identifier = paste(seq_len(nrow(p.df)), directions, sep = ""), 
        ID = p.df$ID, p.df[, c("Forward", "Reverse")], 
        primer_length_fw = nchar(p.df$Forward), 
		primer_length_rev = nchar(p.df$Reverse), 
        Direction = directions,
		Degeneracy_fw =  score_degen(strsplit(p.df$Forward, split = "")),
		Degeneracy_rev = score_degen(strsplit(p.df$Reverse, split = "")),
        Run = sample.name,
        stringsAsFactors = FALSE)
    sel <- which(p.df$Direction == "fw") 
    if (length(sel) != 0 && !is.na(fw.id)) {
        p.df$ID[sel] <- paste0(p.df$ID[sel], fw.id)
    }
    sel <- which(p.df$Direction == "rev") 
    if (length(sel) != 0 && !is.na(rev.id)) {
        p.df$ID[sel] <- paste0(p.df$ID[sel], rev.id)
    }
    p.df$ID <- factor(p.df$ID, levels = p.df$ID)
    p.df$Identifier <- factor(p.df$Identifier, levels = p.df$Identifier)
    p.df <- Primers(p.df) # create Primers object
    return(p.df)
}
#' Disambiguation of Primers.
#'
#' Disambiguates ambiguous primer sequences into all possible sequences.
#'
#' @param p.df Primer data frame.
#' @return Data frame with disambiguated primers.
#' @keywords internal
disambiguate.primers <- function(p.df) {
    f <- convert.from.iupac(p.df$Forward)
    r <- convert.from.iupac(p.df$Reverse)
    # combine
    combis <- lapply(seq_along(f), function(x) expand.grid(f[[x]], r[[x]], stringsAsFactors = FALSE))
    identifiers <- unlist(lapply(seq_along(combis), function(x) paste(p.df$ID[x], 
        "_", seq_len(nrow(combis[[x]])), sep = "")))
    df <- do.call(rbind, combis)
    new.df <- data.frame(ID = identifiers, Forward = df[, 1], Reverse = df[, 2], 
        stringsAsFactors = FALSE)
    # message(new.df) message(class(new.df$Forward))
    return(new.df)
}
#' Pairing of Forward and Reverse Primers.
#'
#' Pairs forward and reverse primers such that coverage is maximized 
#' for every pair.
#'
#' @param primer.df An object of class \code{Primers}.
#' @return An object of class \code{Primers} containing the paired primers.
#' @keywords internal
pair_primers <- function(primer.df, template.df) {
    fw.idx <- which(primer.df$Forward != "")
    rev.idx <- which(primer.df$Reverse != "")
    if (length(fw.idx) == 0 || length(rev.idx) == 0) {
        return(primer.df) # nothing to pair
    }
    if (!"primer_coverage" %in% colnames(primer.df)) {
        stop("pair_primers requires primer coverage.")
    }
    # store already paired primers in 'both.df' for later output
    both.df <- primer.df[primer.df$Direction == "both",]
    # try to pair all single forward/reverse primers
    fw.s.idx <- which(primer.df$Direction == "fw")
    rev.s.idx <- which(primer.df$Direction == "rev")
    # combinations of fw and rev primers
    combis <- expand.grid(fw.s.idx, rev.s.idx)
    if (nrow(combis) != 0) {
        # determine intersection between all combinations
        cvd <- covered.seqs.to.idx(primer.df[, "Covered_Seqs"], template.df)
        cvd.shared <- lapply(seq_len(nrow(combis)), 
            function(x) intersect(cvd[[combis[x,1]]], cvd[[combis[x,2]]])
        )
        cvg.shared <- sapply(cvd.shared, length)
        combis$Coverage <- cvg.shared
        # exclude pairs that don't share any coverage
        sel.combis <- which(combis$Coverage != 0)
        combis <- combis[sel.combis,]
        # construct a new primer data frame using the pairs
        pair.df <- primer.df[combis[,1],]
        pair.df$Reverse <- primer.df$Reverse[combis[,2]]
        pair.df$primer_coverage <- combis$Coverage
        pair.df$Coverage_Ratio <- pair.df$primer_coverage / nrow(template.df)
        pair.df$Covered_Seqs <- unlist(lapply(cvd.shared[sel.combis], function(x) paste(template.df$Identifier[x], collapse = ",")))
        pair.df$Direction <- rep("both", nrow(combis))
        pair.df$ID <- factor(paste0(abbreviate(primer.df$ID[combis[,1]], 5), "+", abbreviate(primer.df$ID[combis[,2]], 5)))
        # select non-redundant pairs of primers:
        cvg.matrix <- get.coverage.matrix(pair.df, template.df)
        sel.cols <- remove.redundant.cols(seq_len(nrow(pair.df)), cvg.matrix)
        pair.df <- pair.df[sel.cols,]
    } else {
        pair.df <- NULL
    }
    # output the already paired primers as well as the new pairs
    out.df <- my_rbind(both.df, pair.df)
    ################
    # when pairing, we exclude primers that weren't paired such that only paired primers remained. otherwise we have problems with the subset coverage!!
    # check whether some primers couldn't be paired
    #missing.idx <- which(unlist(lapply(seq_len(nrow(primer.df)), function(x) all(!grepl(primer.df$ID[x], out.df$ID)))))
    #out.df <- my_rbind(primer.df[missing.idx, ], out.df)
    #################
    return(out.df)
}

#' @rdname Output
#' @name Output
#' @aliases write_primers
#' @return \code{write_primers} stores primers to disk.
#' @export
#' @examples
#' 
#' data(Ippolito)
#' # Store primers as FASTA
#' fname.fasta <- tempfile("my_primers", fileext = ".fasta")
#' write_primers(primer.df, fname.fasta)
#' # Store primers as CSV
#' fname.csv <- tempfile("my_primers", fileext = ".csv")
#' write_primers(primer.df, fname.csv, "CSV")
write_primers <- function(primer.df, fname, ftype = c("FASTA", "CSV")) {
    ftype <- match.arg(ftype)
    if (ftype == "FASTA") {
        fw.idx <- which(primer.df$Forward != "")
        rev.idx <- which(primer.df$Reverse != "")
        seqs <- c(primer.df$Forward[fw.idx], primer.df$Reverse[rev.idx])
        labels <- c(as.character(primer.df$ID[fw.idx]), as.character(primer.df$ID[rev.idx]))
        seqinr::write.fasta(as.list(seqs), as.list(labels), fname)
    } else if (ftype == "CSV") {
        write.csv(primer.df, file = fname, row.names = FALSE)
    } else {
        warning("Unknown filetype: ", ftype)
    }
}
#' Direction of Primers.
#'
#' Identifies the directionality of the input primers.
#'
#' @param primers A primer data frame.
#' @return \code{both} if both, forward and reverse primers exist in \code{primers}.
#' Otherwise, if either only forward primers or reverse primers exist, returns \code{fw}
#' or \code{rev}, respectively.
#' @keywords internal
get.analysis.mode <- function(primers) {
    if (length(primers) == 0) {
        return(NULL)
    }
    mode <- "both"
    if (all(primers$Forward == "" | is.na(primers$Forward))) {
        mode <- "rev"
    } else if (all(primers$Reverse == "" | is.na(primers$Reverse))) {
        mode <- "fw"
    }
    return(mode)
}

#' @rdname AnalysisStats
#' @name AnalysisStats
#' @aliases get_cvg_ratio
#' @details
#' The manner in which \code{get_cvg_ratio} determines the coverage ratio 
#' depends on the directionality of the input primers.
#' If either only forward or reverse primers are inputted, the individual
#' coverage of each primer is used to determine the overall coverage. 
#' If, however, forward and reverse primers are inputted at the same time, 
#' the coverage is defined by the intersection of binding events from both,
#' forward and reverse primers.
#'
#' @return By default, \code{get_cvg_ratio} returns a numeric providing 
#' the expected primer coverage ratio.
#' If \code{as.char} is \code{TRUE}, the output is provided as a
#' percentage-formatted character vector. 
#' The attributes \code{no_covered}, \code{no_templates}, 
#' and \code{covered_templates} provide the number of covered templates, 
#' the total number of templates, and the IDs of covered templates, respectively.
#' @export
#' @examples
#' 
#' data(Ippolito)
#' # Determine the overall coverage
#' cvg.ratio <- get_cvg_ratio(primer.df, template.df)
#' # Determine the identitity coverage ratio 
#' cvg.ratio.0 <- get_cvg_ratio(primer.df, template.df, allowed.mismatches = 0)
get_cvg_ratio <- function(primer.df, template.df, allowed.mismatches = NULL, 
                        cvg.definition = c("constrained", "basic"), 
                        mode.directionality = NULL, as.char = FALSE) {
    if (length(primer.df) == 0 || nrow(primer.df) == 0) {
        # no coverage if there's no input.
        return(0.0)
    }
    if (!is(primer.df, "Primers")) {
        stop("Please input a primer data frame.")
    }
    if (!is(template.df, "Templates")) {
        stop("Please provide a valid template data frame.")
    }
    if (!"Covered_Seqs" %in% colnames(primer.df) || length(template.df) == 0) {
        return(NA)
    }
    cvg.definition <- match.arg(cvg.definition)
    # check whether primers and templates correspond to one another
    if (!check_correspondence(primer.df, template.df)) {
        msg <- paste0("Could not match all coverage events to the input template data frame.\n",
            "Please verify whether the primer coverage was actually computed for the input template data frame.")
        stop(msg)
    }
    if (length(mode.directionality) == 0) {
        # determine directionality
        mode.directionality <- get.analysis.mode(primer.df)
    } else {
        # check input mode
        if (!mode.directionality %in% c("fw", "rev", "both")) {
            stop("Unknown mode directionality for cvg_ratio")
        }
    }
    if (cvg.definition == "constrained" && length(allowed.mismatches) == 0) {
        # use the vanilla coverage definition: faster and doesn't require mismatch information
        cvd <- get_covered.vanilla(primer.df, template.df, mode.directionality)
    } else {
        cvg.data <- get_template_cvg_data(primer.df, template.df)
        # select only coverage events from the selected cvg definition
        cvg.data <- cvg.data[cvg.data$Status == cvg.definition, ]
        # retrieve only the allowed coverage events
        if (length(allowed.mismatches) != 0) {
            # select only events with less or equal the allowed number of mismatches
            cvg.data <- cvg.data[cvg.data$Number_of_mismatches <= allowed.mismatches,]
        } 
        cvd <- unique(cvg.data$Template)
    }
    max.cvg <- length(cvd) / nrow(template.df)  # maximum theoretical coverage
    if (as.char) {
        max.cvg <- paste0(round(max.cvg *100, 2), "%")
    }
    # add number of templates covered and total number of templates as attributes
    attr(max.cvg, "no_covered") <- length(cvd)
    attr(max.cvg, "no_templates") <- nrow(template.df)
    attr(max.cvg, "covered_templates") <- as.character(cvd)
    return(max.cvg)
}

#' Determination of the Covered Sequences.
#'
#' Determines the covered template sequences given by \code{template.df} 
#' that are covered by the primers given by \code{primers}.
#'
#' The manner in which the coverage ratio is evaluated depends on 
#' the directionality of the input primers.
#' If either only forward or reverse primers are inputted, the individual
#' coverage of each primer is used to determine the overall coverage. If, however,
#' forward and reverse primers are inputted at the same time, 
#' the coverage is defined by the intersection of binding events from both,
#' forward and reverse primers.
#'
#' @param primers A \code{Primers} object containing the primers
#' for which the coverage should be evaluated.
#' @param template.df A \code{Templates} object containing
#' the template sequences corresponding to \code{primers}. 
#' @param mode.directionality If \code{mode.directionality} is provided,
#' the coverage of templates is computed for a specific direction of primers.
#' Either "fw" (forward coverage only), "rev" (reverse coverage only), or "both" for both directions. If \code{mode.directionality} is not provided
#' the direction is determined by the input primers.
#' @return The IDs of all covered templates.
#' @keywords internal
get_covered.vanilla <- function(primers, template.df, mode.directionality = NULL) {
    if (length(primers) == 0 || nrow(primers) == 0) {
        # no coverage if there's no input.
        return(0.0)
    }
    if (!is(primers, "Primers")) {
        stop("Please input a primer data frame.")
    }
    if (!is(template.df, "Templates")) {
        stop("Please provide a valid template data frame.")
    }
    if (!"Covered_Seqs" %in% colnames(primers) || length(template.df) == 0) {
        return(NA)
    }
    # check whether primers and templates correspond to one another
    if (!check_correspondence(primers, template.df)) {
        msg <- paste0("Could not match all coverage events to the input template data frame.\n",
            "Please verify whether the primer coverage was actually computed for the input template data frame.")
        stop(msg)
    }
    fw.idx <- which(primers$Forward != "")
    rev.idx <- which(primers$Reverse != "")
    if (length(mode.directionality) == 0) {
        # determine directionality
        mode.directionality <- get.analysis.mode(primers)
    } else {
        # check input mode
        if (!mode.directionality %in% c("fw", "rev", "both")) {
            stop("Unknown mode directionality for cvg_ratio")
        }
    }
    primer.df <- primers
    if (mode.directionality == "both") {
        # fw and rev primers -> intersection of fw and rev cvg events gives the total
        # coverage
        hits.fw <- unique(unlist(covered.seqs.to.idx(primer.df$Covered_Seqs[fw.idx], 
            template.df)))
        hits.rev <- unique(unlist(covered.seqs.to.idx(primer.df$Covered_Seqs[rev.idx], 
            template.df)))
        cvd <- intersect(hits.fw, hits.rev)  # idx of covered templates
    } else if (mode.directionality == "fw") {
        # only fw primers direction
        cvd <- unique(unlist(covered.seqs.to.idx(primer.df$Covered_Seqs[fw.idx], template.df)))
    } else {
        cvd <- unique(unlist(covered.seqs.to.idx(primer.df$Covered_Seqs[rev.idx], template.df)))
    }
    # convert to template IDs
    cvd <- template.df$ID[cvd]
    return(cvd)
}

Try the openPrimeR package in your browser

Any scripts or data that you put into this service are public.

openPrimeR documentation built on Nov. 16, 2020, 2 a.m.