R/phyloTree.R

Defines functions phyloTree

Documented in phyloTree

#' Create phylogenetic tree
#' 
#' Create a phylogenetic tree using neighbor joining tree estimation for amino acid or 
#' nucleotide CDR3 sequences in a list of data frames.
#' 
#' @param list A list data frames of unproductive nucleotide sequences or productive 
#' nucleotide sequences generated by the LymphoSeq function productiveSeq.  
#' vFamilyName, dFamilyName, jFamilyName, and count are required columns.
#' @param sample A character vector indicating the name of the sample in the productive
#' sequence list. 
#' @param type A character vector indicating whether "aminoAcid" or "nucleotide" sequences
#' should be compared.
#' @param layout A character vector indicating the tree layout.  Options include 
#' "rectangular", "slanted", "fan", "circular", "radial" and "unrooted".
#' @param label A Boolean indicating if the sequencing count should be shown next to the leaves.
#' @return Returns a phylogenetic tree where each leaf represents a sequence color coded by the
#' V, D, and J gene usage.  The number next to each leaf refers to the sequence count.  A triangle 
#' shaped leaf indicates the dominant sequence.  Refer to the ggtree Bioconductor package 
#' documentation for details on how to manipulate the tree.
#' @examples
#' file.path <- system.file("extdata", "IGH_sequencing", package = "LymphoSeq")
#' 
#' file.list <- readImmunoSeq(path = file.path)
#' 
#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide")
#' 
#' phyloTree(list = productive.nt, sample = "IGH_MVQ92552A_BL", type = "nucleotide", 
#'          layout = "rectangular")
#' 
#' phyloTree(list = productive.nt, sample = "IGH_MVQ92552A_BL", type = "aminoAcid", 
#'          layout = "circular")
#'          
#' # Add scale and title to figure
#' library(ggtree)
#' library(ggplot2)
#' phyloTree(list = productive.nt, sample = "IGH_MVQ92552A_BL", type = "aminoAcid", 
#'          layout = "rectangular") +
#'          ggtree::theme_tree2() +
#'          ggplot2::theme(legend.position = "right", legend.key = element_rect(colour = "white")) +
#'          ggplot2::ggtitle("Title")
#'          
#' # Hide legend and leaf labels
#' phyloTree(list = productive.nt, sample = "IGH_MVQ92552A_BL", type = "nucleotide", 
#'          layout = "rectangular", label = FALSE) +
#'          ggplot2::theme(legend.position="none")
#'          
#' @export
#' @import ggtree
#' @import ggplot2
#' @importFrom phangorn NJ
phyloTree <- function(list, sample, type = "nucleotide", layout = "rectangular", label = TRUE) {
    file <- list[[sample]]
    if(length(grep(pattern = "vFamilyName|dFamilyName|jFamilyName|count", names(file))) != 4){
        stop("vFamilyName, dFamilyName, jFamilyName, nucleotide, and count are required columns.  Try aggregating the sequences by 'nucleotide' usng the function productiveSeq.", call. = FALSE)
    }
    if(nrow(file) < 3){
        stop("Cannot draw phlogenetic tree with less than 3 sequences.", call. = FALSE)
    }
    file[file == ""] <- "unresolved"
    if(type == "nucleotide") {
        file <- file[nchar(file$nucleotide) > 9, ]
        distance <- adist(file$nucleotide, file$nucleotide)
        colnames(distance) <- rownames(distance) <- file$nucleotide
        names <- file$nucleotide
    }
    if(type == "aminoAcid") {
        file <- file[nchar(file$aminoAcid) > 3, ]
        distance <- adist(file$aminoAcid, file$aminoAcid)
        colnames(distance) <- rownames(distance) <- file$aminoAcid
        names <- file$aminoAcid
    }
    tree <- phangorn::NJ(distance)
    geneFamilies <- paste(file$vFamilyName, file$dFamilyName, file$jFamilyName)
    geneFamilies <- gsub(geneFamilies, pattern = "IGH|IGL|IGK|TCRB|TCRA", replacement = "")
    geneFamilies <- gsub(geneFamilies, pattern = "unresolved", replacement = "UNR")
    tree.annotation <- data.frame(names = names, count = file$count, 
                                  geneFamilies = geneFamilies, aminoAcid = file$aminoAcid,
                                  frequencyCount = round(file$frequencyCount, digits = 2), 
                                  dominant = c("Yes", rep("No", nrow(file) - 1)))
    getPalette <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))
    plot <- ggtree::ggtree(tree, layout = layout) %<+% tree.annotation + 
        ggtree::geom_tippoint(aes_string(color = "geneFamilies", shape = "dominant"), size = 3) + 
        ggplot2::scale_color_manual(values = getPalette(length(unique(geneFamilies)))) + 
        ggplot2::guides(shape = FALSE) + 
        ggplot2::theme(legend.position = "bottom", legend.key = element_rect(colour = "white")) + 
        ggplot2::labs(color = "")
    if(label == TRUE){
        plot <- plot + ggplot2::geom_text(aes_string(label="count"), nudge_x = 0.5, hjust = 0, size = 2.5, na.rm = TRUE, check_overlap = TRUE)
    }
    return(plot)
}
davidcoffey/LymphoSeq documentation built on Dec. 31, 2019, 9:52 p.m.