R/geneFreq.R

Defines functions geneFreq

Documented in geneFreq

#' Gene frequencies
#' 
#' Creates a data frame of VDJ gene counts and frequencies.
#' 
#' @param productive.nt A list of one or more data frames of productive sequences 
#' generated by the LymphoSeq function productiveSeq where the parameter 
#' aggregate is set to "nucleotide".
#' @param locus A character vector indicating which VDJ genes to include in the 
#' output.  Available options include "VDJ", "DJ", "VJ", "DJ", "V", "D", or "J".
#' @param family A Boolean value indicating whether or not family names instead 
#' of gene names are used.  If TRUE, then family names are used and if FALSE, 
#' gene names are used.
#' @return Returns a data frame with the sample names, VDJ gene name, count, and 
#' \% frequency of the V, D, or J genes (each gene frequency should add to 
#' 100\% for each sample).
#' @examples
#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
#' 
#' file.list <- readImmunoSeq(path = file.path)
#' 
#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide")
#' 
#' geneFreq(productive.nt, locus = "VDJ", family = FALSE)
#' 
#' # Create a heat map of V gene usage
#' vfamilies <- geneFreq(productive.nt, locus = "V", family = TRUE)
#' 
#' require(reshape)
#' 
#' vfamilies <- reshape::cast(vfamilies, familyName ~ samples, value = "frequencyGene", sum)
#' 
#' rownames(vfamilies) <- as.character(vfamilies$familyName)
#' 
#' vfamilies$familyName <- NULL
#' 
#' RedBlue <- grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(11, "RdBu")))(256)
#' 
#' require(pheatmap)
#' 
#' pheatmap::pheatmap(vfamilies, color = RedBlue, scale = "row")
#' 
#' # Create a word cloud of V gene usage
#' vgenes <- geneFreq(productive.nt, locus = "V", family = FALSE)
#' 
#' require(wordcloud)
#' 
#' wordcloud::wordcloud(words = vgenes[vgenes$samples == "TRB_Unsorted_83", "geneName"], 
#'    freq = vgenes[vgenes$samples == "TRB_Unsorted_83", "frequencyGene"], 
#' 	  colors = RedBlue)
#' 
#' # Create a cumulative frequency bar plot of V gene usage
#' vgenes <- geneFreq(productive.nt, locus = "V", family = FALSE)
#' 
#' require(ggplot2)
#' 
#' ggplot2::ggplot(vgenes, aes(x = samples, y = frequencyGene, fill = geneName)) +
#'   geom_bar(stat = "identity") +
#'   theme_minimal() + 
#'   scale_y_continuous(expand = c(0, 0)) + 
#'   guides(fill = guide_legend(ncol = 3)) +
#'   labs(y = "Frequency (%)", x = "", fill = "") +
#'   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
#' @export
#' @importFrom stats aggregate
geneFreq <- function(productive.nt, locus = "VDJ", family = FALSE) {
    gene.names <- data.frame()
    if (family == FALSE) {
        i <- 1
        for (i in 1:length(productive.nt)) {
            file <- productive.nt[[i]]
            file[file == ""] <- "unresolved"
            name <- names(productive.nt)
            v <- data.frame()
            d <- data.frame()
            j <- data.frame()
            if (grepl("V|v", locus)) {
                v <- aggregate(count ~ vGeneName, data = file, FUN = sum)
                v <- data.frame(samples = name[i], geneName = v$vGeneName, 
                                count = v$count, 
                                frequencyGene = v$count/sum(v$count) * 100)
            }
            if (grepl("D|d", locus)) {
                d <- aggregate(count ~ dGeneName, data = file, FUN = sum)
                d <- data.frame(samples = name[i], geneName = d$dGeneName, 
                                count = d$count, 
                                frequencyGene = d$count/sum(d$count) * 100)
            }
            if (grepl("J|j", locus)) {
                j <- aggregate(count ~ jGeneName, data = file, FUN = sum)
                j <- data.frame(samples = name[i], geneName = j$jGeneName, 
                                count = j$count, 
                                frequencyGene = j$count/sum(j$count) * 100)
            }
            gene.names <- rbind(v, d, j, gene.names)
        }
    } else {
        i <- 1
        for (i in 1:length(productive.nt)) {
            file <- productive.nt[[i]]
            file[file == ""] <- "unresolved"
            name <- names(productive.nt)
            v <- data.frame()
            d <- data.frame()
            j <- data.frame()
            if (grepl("V|v", locus)) {
                v <- aggregate(count ~ vFamilyName, data = file, FUN = sum)
                v <- data.frame(samples = name[i], familyName = v$vFamilyName, 
                                count = v$count, 
                                frequencyGene = v$count/sum(v$count) * 100)
            }
            if (grepl("D|d", locus)) {
                d <- aggregate(count ~ dFamilyName, data = file, FUN = sum)
                d <- data.frame(samples = name[i], familyName = d$dFamilyName, 
                                count = d$count, 
                                frequencyGene = d$count/sum(d$count) * 100)
            }
            if (grepl("J|j", locus)) {
                j <- aggregate(count ~ jFamilyName, data = file, FUN = sum)
                j <- data.frame(samples = name[i], familyName = j$jFamilyName, 
                                count = j$count, 
                                frequencyGene = j$count/sum(j$count) * 100)
            }
            gene.names <- rbind(v, d, j, gene.names)
        }
    }
    return(gene.names)
}

Try the LymphoSeq package in your browser

Any scripts or data that you put into this service are public.

LymphoSeq documentation built on Nov. 8, 2020, 8:09 p.m.