R/mutTrunkBranch.R

Defines functions mutTrunkBranch

Documented in mutTrunkBranch

#' mutTrunkBranch
#' @description Summarize and conduct paired Fisher test of mutations of trunk/branches in a phylogenetic tree.
#' 
#' @param phyloTree phyloTree or phyloTreeList object generated by \code{\link{getPhyloTree}} function.
#' @param patient.id Select the specific patients. Default NULL, all patients are included
#' @param CT Distinction between C>T at CpG and C>T at other sites. (Default: FALSE).
#' @param pvalue Confidence level of the interval for Fisher test. Default 0.05.
#' @param plot Logical. (Default: TRUE).
#' 
#' @examples
#' maf.File <- system.file("extdata/", "CRC_HZ.maf", package = "MesKit")
#' clin.File <- system.file("extdata/", "CRC_HZ.clin.txt", package = "MesKit")
#' ccf.File <- system.file("extdata/", "CRC_HZ.ccf.tsv", package = "MesKit")
#' maf <- readMaf(mafFile=maf.File, clinicalFile = clin.File, ccfFile=ccf.File, refBuild="hg19")
#' 
#' ## Load a reference genome.
#' library(BSgenome.Hsapiens.UCSC.hg19)
#' 
#' phyloTree <- getPhyloTree(maf, patient.id = 'V402')
#' mutTrunkBranch(phyloTree, plot = TRUE)
#' 
#' @return  a list of box plots based on mutational categories
#' @importFrom data.table data.table
#' @importFrom stats fisher.test
#' @export mutTrunkBranch

mutTrunkBranch <- function(phyloTree,
                           patient.id = NULL,
                           CT = FALSE,
                           pvalue = 0.05,
                           plot = TRUE){
    
    
    processMTB <- function(phyloTree){
        patient <- getPhyloTreePatient(phyloTree)

        ## get trinucleotide matrix
        tsb.label <- getPhyloTreeTsbLabel(phyloTree)
        if(nrow(tsb.label) > 0){
            tri_matrix <- tri_matrix_list[[patient]]$tri_matrix
        }else{
            tri_matrix <- tri_matrix_list[[patient]]
        }
        
        ## set groups
        if(CT){
            group_level <- c("C>A","C>G","C>T at CpG","C>T other","T>A","T>C","T>G")
        }else{
            group_level <- c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")
        }
        
        ## separate trunk and branch data
        tri_matrix_trunk <- tri_matrix["Trunk", ]
        tri_matrix_branch <- tri_matrix["Branches", ]
        
        if(is(tri_matrix_branch, "matrix")){
            tri_matrix_branch <- colSums(tri_matrix_branch)
        }
        
        tri_matrixBT <- rbind(Trunk=tri_matrix_trunk, Branch=tri_matrix_branch)
        
        ## mutation spectrum
        if(CT){
            BT_spectrum <- data.frame(Type = colnames(tri_matrixBT), t(tri_matrixBT)) %>% 
                dplyr::rowwise() %>% 
                dplyr::mutate(Group = dplyr::case_when(
                    grepl("CpG",.data$Type) ~  
                        "C>T at CpG",
                    grepl("other",.data$Type) ~  
                        "C>T other",
                    !grepl("CpG|other",.data$Type) ~ 
                        paste(strsplit(as.character(.data$Type) ,"")[[1]][3:5], collapse = "")
                )) %>%
                as.data.frame() %>% 
                tidyr::pivot_longer(
                    cols = c("Trunk","Branch"),
                    names_to = "BT",
                    values_to = "mut_num")  
        }else{
            BT_spectrum <- data.frame(Type = colnames(tri_matrixBT), t(tri_matrixBT)) %>% 
                dplyr::rowwise() %>% 
                dplyr::mutate(Group = paste(strsplit(as.character(.data$Type) ,"")[[1]][3:5], collapse = "")) %>% 
                as.data.frame()%>% 
                tidyr::pivot_longer(
                    cols = c("Trunk","Branch"),
                    names_to = "BT",
                    values_to = "mut_num")  
        }
        # print(unique(BT_spectrum$Type))
        ## fisher test
        df_pvalue_list <- lapply(group_level, function(group){
            b1 <- sum(BT_spectrum[BT_spectrum$Group == group &  BT_spectrum$BT == "Branch", ]$mut_num) 
            b2 <- sum(BT_spectrum[BT_spectrum$Group != group &  BT_spectrum$BT == "Branch", ]$mut_num)
            t1<-  sum(BT_spectrum[BT_spectrum$Group == group & BT_spectrum$BT == "Trunk", ]$mut_num) 
            t2 <-  sum(BT_spectrum[BT_spectrum$Group != group & BT_spectrum$BT == "Trunk", ]$mut_num) 
            if(all(!is.nan(b1)) & all(!is.nan(t1))){
                m <- matrix(c(b1, t1, b2, t2),ncol = 2)
                pValue <- stats::fisher.test(m,alternative = "two.sided")$p.value
            }
            else{
                pValue <- NA
            }
            sub <- data.frame(group, pValue)
            colnames(sub) <- c("Group", "P_Value")
            return(sub)
        })
        df_pvalue <- dplyr::bind_rows(df_pvalue_list) %>% as.data.table()
        
        ## generate output data.frame with quantity of mutations in different categories
        
        BT_summary <-  BT_spectrum %>% 
            dplyr::group_by(.data$BT, .data$Group) %>% 
            dplyr::summarise(mut_num = sum(.data$mut_num)) %>% 
            tidyr::pivot_wider(names_from = "BT",
                               values_from = "mut_num") %>% 
            merge(df_pvalue,by = "Group") %>% 
            dplyr::mutate(Patient_ID = patient) %>% 
            dplyr::select("Patient_ID", "Group", "Trunk", "Branch", "P_Value")
        
        if(plot){
            trunk_num <- sum(BT_summary$Trunk)
            branch_num <- sum(BT_summary$Branch)
            if(sum(BT_summary$Trunk) == 0){
                BT_summary$Trunk <- 0
            }else{
                BT_summary$Trunk <- BT_summary$Trunk/sum(BT_summary$Trunk)  
            }
            if(sum(BT_summary$Branch) == 0){
                BT_summary$Branch <-  0
            }else{
                BT_summary$Branch <- BT_summary$Branch/sum(BT_summary$Branch)
            }
            
            ## set data table for bar plot
            dat <- BT_summary %>% 
                tidyr::pivot_longer(cols = c("Trunk","Branch"),
                                    names_to = "BT",
                                    values_to = "fraction") %>%
                as.data.frame() 
            dat$Group <- factor(dat$Group, levels = unique(group_level))
            dat <- dplyr::arrange(dat, dplyr::desc(.data$Group))
            
            ## get position
            dat <- as.data.table(dat)
            dat$rect.xmin <- 0
            dat$rect.xmax <- 0
            dat$rect.ymin <- 0
            dat$rect.ymax <- 0
            i <- 0
            
            i_list1 <- seq(0, 0.15*(length(unique(dat$BT))-1), 0.15)
            i_list2 <- i_list1 + 0.1
            
            
            d_list <- lapply(seq_len(length(unique(dat$BT))), function(i){
                bt <- unique(dat$BT)[i]
                d <- dat[dat$BT == bt,]
                d$rect.xmin <- i_list1[i]
                d$rect.xmax <- i_list2[i]
                ## get position for y axis
                yy <- cumsum(d$fraction)
                len.yy <- length(yy)
                d$rect.ymin <- c(0,yy[-len.yy])
                d$rect.ymax <- yy
                return(d)
            })
            dat <- dplyr::bind_rows(d_list) %>% as.data.table()
            
            
            ## set table for number of mutation
            num_table_list <- lapply(unique(dat$BT), function(bt){
                xmin <- min(dat[dat$BT == bt]$rect.xmin)
                xmax <- max(dat[dat$BT == bt]$rect.xmax)
                x <- xmin + (xmax-xmin)/2
                if(bt == "Trunk"){
                    sub <- data.table(x = x, y = 1.03, num = trunk_num)
                }else{
                    sub <- data.table(x = x, y = 1.03, num = branch_num)
                }
                return(sub)
            })
            num_table <- dplyr::bind_rows(num_table_list) %>% as.data.table()

            ## set table for pvalue
            pvalue_table_list <- lapply(unique(dat$Group), function(g){
                v <- unique(dat[dat$Group == g]$P_Value) 
                if(is.na(v)){
                    return(data.table())
                }
                if(v < pvalue){
                    x <- max(dat[dat$Group == g]$rect.xmax)
                    ymin <- dat[dat$Group == g & dat$BT == "Branch"]$rect.ymin
                    ymax <- dat[dat$Group == g& dat$BT == "Branch"]$rect.ymax
                    y <- ymin + (ymax - ymin)/2
                    sub <- data.table(x,y,label = "*")
                    return(sub)
                }else{
                    return(data.table())
                }
            })
            pvalue_table <- dplyr::bind_rows(pvalue_table_list) %>% as.data.table()

            
            ## set segment from trunk to branches
            segment_table_list <- lapply(unique(dat$Group), function(g){
                fractions <- dat[dat$Group == g]$fraction
                if(all(fractions == 0)){
                    return(data.table())
                }
                x1 <- dat[dat$Group == g& dat$BT == "Trunk"]$rect.xmax
                x2 <- dat[dat$Group == g& dat$BT == "Branch"]$rect.xmin
                y1 <- dat[dat$Group == g& dat$BT == "Trunk"]$rect.ymax
                y2 <- dat[dat$Group == g& dat$BT == "Branch"]$rect.ymax
                sub <- data.table(x1,x2,y1,y2)
                return(sub)
            })
            segment_table <- dplyr::bind_rows(segment_table_list) %>% as.data.table()
            

            
            ## set color 
            if(CT){
                all_color <- c("C>A" = "#E64B35FF",
                               "C>G" = "#4DBBD5FF",
                               "T>A" = "#3C5488FF",
                               "T>C" = "#F39B7FFF",
                               "T>G" = "#8491B4FF",
                               "C>T at CpG" = "#00A087FF",
                               "C>T other" = "#91D1C2FF") 
            }else{
                all_color <- c("C>A" = "#E64B35FF",
                               "C>G" = "#4DBBD5FF",
                               "C>T" = "#00A087FF",
                               "T>A" = "#3C5488FF",
                               "T>C" = "#F39B7FFF",
                               "T>G" = "#8491B4FF") 
            }
            
            group_color <- all_color[unique(dat$Group)]
            
            ## initialize variable in ggplot for biocheck error
            rect.xmin <- NULL
            rect.xmax <- NULL
            rect.ymin <- NULL
            rect.ymax <- NULL
            Group <- NULL
            num <- NULL
            label <- NULL
            x <- NULL
            x1 <- NULL
            x2 <- NULL
            y <- NULL
            y1 <- NULL
            y2 <- NULL
            
            ## plot
            pic <- ggplot() + 
                geom_rect(data = dat,
                          aes(xmin = rect.xmin, xmax = rect.xmax,
                              ymin = rect.ymin, ymax = rect.ymax,
                              fill = Group),color = "black")+ 
                
                # ## lable patientid
                # geom_text(data = patient.text.table,
                #           aes(x = p.x, y = 1.1, label = Patient_ID),size = 6)+
                ggtitle(patient) + 
                # label number of mutation
                geom_text(data = num_table,
                          aes(x = x, y = y, label = num),size = 4)+
                
                
                geom_segment(aes(y = 0 ,yend = 1,x=-Inf,xend=-Inf), size = 1.5)+
                theme(panel.grid.major = element_blank(),
                      panel.grid.minor = element_blank(),
                      panel.background = element_blank(),
                      plot.title = element_text(hjust = 0.48,size = 13.5,vjust = -2),
                      axis.ticks.x = element_blank(),
                      axis.title.y = element_text(size = 11,colour = "black"),
                      axis.text.x = element_text(angle = 60,size = 11,colour = "black",hjust = 1,margin = margin(t = -10)),
                      axis.text.y = element_text(size = 10,colour = "black"),
                      axis.ticks.length.y = unit(.25, "cm"),
                      axis.ticks.y = element_line(size = 1)) +
                scale_fill_manual(values = group_color) + 
                xlab("") + 
                ylab("Proportion")+
                # ggtitle(unique(dat$Patient_ID)) + 
                scale_y_continuous(breaks = c(0,0.25,0.50,0.75,1))+
                scale_x_continuous(breaks = unique(dat$rect.xmin+(dat$rect.xmax - dat$rect.xmin)/2) ,
                                   labels = rep(c("Trunk","Branches"),length(unique(dat$Patient_ID))))
            
            ## label significant pvalue
            if(nrow(pvalue_table) > 0){
                pic <- pic + 
                    geom_text(data = pvalue_table,
                              aes(x = x+0.01, y = y, label = label),
                              size = 7,vjust = 0.8)
            }
            
            ## segment between trunk and branches
            if(nrow(segment_table) > 0){
                pic <- pic + 
                    geom_segment(data = segment_table,
                                 aes(x = x1, xend = x2, y = y1, yend = y2))
            }
        }
        
        if(plot){
            return(list(mutTrunkBranch.res = BT_summary, mutTrunkBranch.plot = pic))
        }else{
            return(list(mutTrunkBranch.res = BT_summary))
        }
    }
    
    ## preprocess input data
    phyloTree_input <- subPhyloTree(phyloTree, patient.id = patient.id)
    ## get trinucleotide matrix
    tri_matrix_list <- subTriMatrix(phyloTree_list = phyloTree_input, CT = CT, level = "6")
    
    result <- lapply(phyloTree_input, processMTB)
    result <- result[!is.na(result)]
    
    if(length(result) == 1){
        return(result[[1]])
    }else if(length(result) == 0){
        return(NA)
    }else{
        return(result)
    }
}

Try the MesKit package in your browser

Any scripts or data that you put into this service are public.

MesKit documentation built on March 28, 2021, 6 p.m.