#' Productive sequences
#'
#' Remove unproductive CDR3 sequences from a single data frame.
#'
#' @param sample A data frame consisting of antigen receptor sequencing data.
#' "aminoAcid", "count", and "frequencyCount" are required columns.
#' @param aggregate Indicates whether the values of "count", "frequencyCount",
#' and "esimatedNumberGenomes" should be aggregated by amino acid or nucleotide
#' sequence. Acceptable values are "aminoAcid" or "nucleotide". If "aminoAcid"
#' is selected, then the resulting data frame will have columns corresponding to
#' "aminoAcid", "count", "frequnecyCount", and "estimatedNumberGenomes" (if this
#' column is available). If "nucleotide" is selected then all columns in the
#' original data frame will be present in the outputted data frame. The difference in
#' output is due to the fact that the same amino acid CDR3 sequence may be
#' encoded by multiple unique nucleotide sequences with differing V, D, and J
#' genes.
#' @return Returns a data frame of productive amino acid sequences with recomputed
#' values for "count", "frequencyCount", and "esimatedNumberGenomes". A
#' productive sequences is defined as a sequence that is in frame and does not
#' have an early stop codon.
#' @examples
#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
#'
#' file.list <- readImmunoSeq(path = file.path)
#'
#' productive <- productive(sample = file.list[["TRB_Unsorted_32"]], aggregate = "aminoAcid")
#' @seealso \code{\link{productiveSeq}}
#' @export
#' @importFrom stats aggregate
productive <- function(sample, aggregate = "aminoAcid") {
productive.seqs <- sample[which(sample[, "aminoAcid"] != "" &
!grepl("\\*", sample[, "aminoAcid"])), ]
if(nrow(productive.seqs) != 0){
if (any(grepl("estimatedNumberGenomes", colnames(productive.seqs)))) {
if (aggregate == "aminoAcid") {
count <- aggregate(count ~ aminoAcid, data = productive.seqs, FUN = sum)
productive.seqs$estimatedNumberGenomes <- suppressWarnings(as.integer(productive.seqs$estimatedNumberGenomes))
productive.seqs$estimatedNumberGenomes[is.na(productive.seqs$estimatedNumberGenomes)] <- 0
productive.seqs$estimatedNumberGenomes[is.na(as.integer(productive.seqs$estimatedNumberGenomes))] <- 0
estimatedNumberGenomes <- aggregate(estimatedNumberGenomes ~ aminoAcid,
data = productive.seqs, FUN = sum)
merged <- merge(count, estimatedNumberGenomes, by = "aminoAcid")
merged$frequencyCount <- merged$count/sum(count$count) * 100
}
if (aggregate == "nucleotide") {
count <- aggregate(count ~ nucleotide, data = productive.seqs, FUN = sum)
productive.seqs$estimatedNumberGenomes <- suppressWarnings(as.integer(productive.seqs$estimatedNumberGenomes))
productive.seqs$estimatedNumberGenomes[is.na(productive.seqs$estimatedNumberGenomes)] <- 0
productive.seqs$estimatedNumberGenomes[is.na(as.integer(productive.seqs$estimatedNumberGenomes))] <- 0
estimatedNumberGenomes <- aggregate(estimatedNumberGenomes ~ nucleotide,
data = productive.seqs, FUN = sum)
productive.seqs$count <- NULL
productive.seqs$estimatedNumberGenomes <- NULL
merged <- Reduce(function(x, y) merge(x, y, by = "nucleotide"),
list(productive.seqs, count, estimatedNumberGenomes))
merged$frequencyCount <- merged$count/sum(count$count) * 100
}
} else {
if (aggregate == "aminoAcid") {
merged <- aggregate(count ~ aminoAcid, data = productive.seqs, FUN = sum)
merged$frequencyCount <- merged$count/sum(merged$count) * 100
}
if (aggregate == "nucleotide") {
count <- aggregate(count ~ nucleotide, data = productive.seqs, FUN = sum)
productive.seqs$count <- NULL
merged <- merge(productive.seqs, count, by = "nucleotide")
merged$frequencyCount <- merged$count/sum(merged$count) * 100
}
}
merged <- merged[order(merged$frequencyCount, decreasing = TRUE), intersect(names(sample), names(merged))]
rownames(merged) <- NULL
return(merged)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.