R/productive.R
In LymphoSeq: Analyze high-throughput sequencing of T and B cell receptors

Documented in productive

#' Productive sequences
#' 
#' Remove unproductive CDR3 sequences from a single data frame.
#' 
#' @param sample A data frame consisting of antigen receptor sequencing data.  
#' "aminoAcid", "count", and "frequencyCount" are required columns.
#' @param aggregate Indicates whether the values of "count", "frequencyCount", 
#' and "esimatedNumberGenomes" should be aggregated by amino acid or nucleotide 
#' sequence.  Acceptable values are "aminoAcid" or "nucleotide".  If "aminoAcid" 
#' is selected, then the resulting data frame will have columns corresponding to 
#' "aminoAcid", "count", "frequnecyCount", and "estimatedNumberGenomes" (if this 
#' column is available).  If "nucleotide" is selected then all columns in the 
#' original data frame will be present in the outputted data frame.  The difference in 
#' output is due to the fact that the same amino acid CDR3 sequence may be 
#' encoded by multiple unique nucleotide sequences with differing V, D, and J 
#' genes. 
#' @return Returns a data frame of productive amino acid sequences with recomputed 
#' values for "count", "frequencyCount", and "esimatedNumberGenomes".  A 
#' productive sequences is defined as a sequence that is in frame and does not 
#' have an early stop codon.
#' @examples
#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
#' 
#' file.list <- readImmunoSeq(path = file.path)
#' 
#' productive <- productive(sample = file.list[["TRB_Unsorted_32"]], aggregate = "aminoAcid")
#' @seealso \code{\link{productiveSeq}}
#' @export
#' @importFrom stats aggregate
productive <- function(sample, aggregate = "aminoAcid") {
    productive.seqs <- sample[which(sample[, "aminoAcid"] != "" &
                                        !grepl("\\*", sample[, "aminoAcid"])), ]
    if(nrow(productive.seqs) != 0){
        if (any(grepl("estimatedNumberGenomes", colnames(productive.seqs)))) {
            if (aggregate == "aminoAcid") {
                count <- aggregate(count ~ aminoAcid, data = productive.seqs, FUN = sum)
                productive.seqs$estimatedNumberGenomes <- suppressWarnings(as.integer(productive.seqs$estimatedNumberGenomes))
                productive.seqs$estimatedNumberGenomes[is.na(productive.seqs$estimatedNumberGenomes)] <- 0
                productive.seqs$estimatedNumberGenomes[is.na(as.integer(productive.seqs$estimatedNumberGenomes))] <- 0
                estimatedNumberGenomes <- aggregate(estimatedNumberGenomes ~ aminoAcid, 
                                                    data = productive.seqs, FUN = sum)
                merged <- merge(count, estimatedNumberGenomes, by = "aminoAcid")
                merged$frequencyCount <- merged$count/sum(count$count) * 100
            }
            if (aggregate == "nucleotide") {
                count <- aggregate(count ~ nucleotide, data = productive.seqs, FUN = sum)
                productive.seqs$estimatedNumberGenomes <- suppressWarnings(as.integer(productive.seqs$estimatedNumberGenomes))
                productive.seqs$estimatedNumberGenomes[is.na(productive.seqs$estimatedNumberGenomes)] <- 0
                productive.seqs$estimatedNumberGenomes[is.na(as.integer(productive.seqs$estimatedNumberGenomes))] <- 0
                estimatedNumberGenomes <- aggregate(estimatedNumberGenomes ~ nucleotide, 
                                                    data = productive.seqs, FUN = sum)
                productive.seqs$count <- NULL
                productive.seqs$estimatedNumberGenomes <- NULL
                merged <- Reduce(function(x, y) merge(x, y, by = "nucleotide"), 
                                 list(productive.seqs, count, estimatedNumberGenomes))
                merged$frequencyCount <- merged$count/sum(count$count) * 100
            }
        } else {
            if (aggregate == "aminoAcid") {
                merged <- aggregate(count ~ aminoAcid, data = productive.seqs, FUN = sum)
                merged$frequencyCount <- merged$count/sum(merged$count) * 100
            }
            if (aggregate == "nucleotide") {
                count <- aggregate(count ~ nucleotide, data = productive.seqs, FUN = sum)
                productive.seqs$count <- NULL
                merged <- merge(productive.seqs, count, by = "nucleotide")
                merged$frequencyCount <- merged$count/sum(merged$count) * 100
            }
        }
        merged <- merged[order(merged$frequencyCount, decreasing = TRUE), intersect(names(sample), names(merged))]
        rownames(merged) <- NULL
        return(merged)
    }
}