R/plot_volcano.R

Defines functions plot_volcano

Documented in plot_volcano

#' @title Plot volcanoplot
#' @description This function processes the summary statistics table generated by differential expression analysis
#' like \code{limma} or \code{DESeq2} to show on the volcano plot with the highlight gene set option (like disease
#' related genes from Disease vs Healthy comparison).
#'
#' @param data Summary statistics table or a list contain multiple summary statistics tables from limma or DEseq2, where each row is a gene.
#' @param comp.names A character vector that contains the comparison names which correspond to the same order as \code{data}. Required if \code{data} is list. No default.
#' @param geneset Summary statistic table that contains the genes which needed to be highlighted, the gene name format (in row names)
#' needs to be consistent with the main summary statistics table). For example, this summary statistics
#'  table could be the output summary statistics table from the Disease vs Healthy comparison (Only contains
#'  the subsetted significant genes to be highlighted).
#' @param geneset.FCflag The column name of fold change in \code{geneset}, Default = "logFC".
#' @param highlight.1 Genes to be highlighted, in the format of a vector consists of gene names. The gene name format
#'  needs to be consistent to the main summary statistics table.
#' @param highlight.2 Genes to be highlighted, in the format of a vector consists of gene names. The gene name format
#'  needs to be consistent to the main summary statistics table.
#' @param upcolor The color of the gene names in \code{highlight.1} or the positive fold change gene
#' in \code{geneset}, default = "#FDE725FF" (viridis color palette).
#' @param downcolor The color of the gene names in \code{highlight.2} or the negative fold change gene
#' in \code{geneset}, default = "#440154FF" (viridis color palette).
#' @param plot.save.to The file name and address where to save the volcano plot, e.g. "~/address_to_folder/volcano_plot.png".
#' @param xlim Range of x axis. Default = \code{c(-3,3)}.
#' @param ylim Range of x axis. Default = \code{c(0,6)}.
#' @param FCflag Column name of log2FC in the summary statistics table. Default = "logFC".
#' @param FDRflag Column name of FDR in the summary statistics table. Default = "adj.P.Val".
#' @param highlight.FC.cutoff Fold change cutoff line want to be shown on the plot. Default = 1.5.
#' @param highlight.FDR.cutoff FDR cutoff shades want to be shown on the plot. Default = 0.05.
#' @param title The plot title. Default "Volcano plot".
#' @param xlab The label for x-axis. Default "log2 Fold Change".
#' @param ylab The label for y-axis. Default "log10(FDR)".
#'
#'
#' @return The function return a volcano plot as a ggplot object.
#'
#' @importFrom ggplot2 ggsave ggplot geom_point geom_vline xlim ylim labs theme geom_hline xlab ylab ggtitle facet_wrap
#' @importFrom dplyr %>%
#' @importFrom tibble as_tibble
#' @importFrom tidyr separate
#' @importFrom purrr map
#' @importFrom rlang .data
#'
#' @export plot_volcano
#'
#' @references Xingpeng Li & Tatiana Gelaf Romer & Olya Besedina, RVA - RNAseq Visualization Automation tool.
#'
#' @details The function takes the summary statistics table and returns a ggplot, with the option to highlight
#' genes, e.g. disease signature genes, the genes which are up-regulated and down-regulated in diseased subjects.
#'
#'
#' @examples
#' plot_volcano(data = Sample_summary_statistics_table,
#'              geneset = Sample_disease_gene_set)
#'
#' plot_volcano(data = list(Sample_summary_statistics_table, Sample_summary_statistics_table1),
#'             comp.names = c("A", "B"),
#'             geneset = Sample_disease_gene_set)



plot_volcano <- function(data=data,
                         comp.names = NULL,
                         geneset = NULL,
                         geneset.FCflag = "logFC",
                         highlight.1 = NULL,
                         highlight.2 = NULL,
                         upcolor = "#FF0000",
                         downcolor = "#0000FF",
                         plot.save.to = NULL,
                         xlim = c(-4,4),
                         ylim = c(0,12),
                         FCflag = "logFC",
                         FDRflag = "adj.P.Val",
                         highlight.FC.cutoff = 1.5,
                         highlight.FDR.cutoff = 0.05,
                         title= "Volcano plot",
                         xlab = "log2 Fold Change",
                         ylab = "log10(FDR)"
){
        message(paste0("\n\n Running plot volcano... Please make sure gene id type(rownames) of `data` consistent to that of `geneset` (if provided). \n\n"))


        suppressWarnings({
        suppressMessages({


        validate.geneset(data, geneset, highlight.1, highlight.2)

        if(is.data.frame(geneset)) {
                if(inherits(data, 'list')){
                        #proposed change here instead of referrin to rownames of data[[1]], gonna take intersect of all rows present
                        geneset.FC.thresh <- geneset[,geneset.FCflag] > 0
                        geneset.genes <- rownames(geneset)
                        #collect all rownames from list of dataframes and find intersect amongst all of them
                        rname_colec <- map(data,rownames)
                        common_rnames <- Reduce(intersect,rname_colec)

                        message(paste0("\n\n Provided input list had a total of ", length(common_rnames),
                                   " in common, non-common gene id will not be considered. \n\n"))
                        highlight.1 <- geneset.genes[geneset.FC.thresh] %>%
                                as.character %>%
                                intersect(common_rnames) # need to add to validator to compare the gene names consistent across the data.list
                        highlight.2 <- geneset.genes[!geneset.FC.thresh] %>%
                                as.character %>%
                                intersect(common_rnames)
                } else {

                        geneset.FC.thresh <- geneset[,geneset.FCflag] > 0
                        geneset.genes <- rownames(geneset)
                        highlight.1 <- geneset.genes[geneset.FC.thresh] %>%
                                as.character %>%
                                intersect(rownames(data)) # need to add to validator to compare the gene names consistent across the data list
                        highlight.2 <- geneset.genes[!geneset.FC.thresh] %>%
                                as.character %>%
                                intersect(rownames(data))
                }
        }

        if(inherits(data, "list")){
                #check if same length of the data list names are provided
                if(!is.null(comp.names) & length(data) != length(comp.names)){
                        message("Please make sure the provided summary statistics list 'dat.list' has the same length as 'comp.names'.")
                        return(NULL)
                }

                data.n <- set_names(data,comp.names)

                data.p <- map(data.n, as_tibble, rownames = "gene") %>%
                  bind_rows(.id = "Comparisons.ID")
                
				data.p$Comparisons.ID = factor(data.p$Comparisons.ID, levels = comp.names)


        }else{

                data.p <- data
                data.p$gene <- rownames(data.p)
        }

        #log transform the cutoff lines on the plot
        highlight.FC.cutoff <- log2(highlight.FC.cutoff)
        highlight.FDR.cutoff <- -log10(highlight.FDR.cutoff)

        #rename the column names for plotting
        colnames(data.p)[which(names(data.p) == FCflag)] <- "FC"
        colnames(data.p)[which(names(data.p) == FDRflag)] <- "FDR"

        p <- ggplot() +
                geom_point(data=data.p, aes(x = .data$FC, y = -log10(.data$FDR)),color="grey") +
				geom_point(data=data.p[data.p$gene %in% highlight.1,],
							   aes(x = .data$FC, y = -log10(.data$FDR)),
							   color=upcolor,
							   size=1.5,
							   shape = 21) +
					geom_point(data=data.p[data.p$gene %in% highlight.2,],
							   aes(x = .data$FC, y = -log10(.data$FDR)),
							   color=downcolor,
							   size=1.5,
							   shape = 21) +
                geom_vline(xintercept=-log2(highlight.FC.cutoff), linetype="dashed", colour = "black") +
                geom_vline(xintercept=log2(highlight.FC.cutoff), linetype="dashed", colour = "black") +
                geom_hline(yintercept= -log10(highlight.FDR.cutoff),color = "grey") +
                ylim(ylim) +
                xlim(xlim) +
                ggtitle(title) +
                xlab(xlab) +
                ylab(ylab)
				
		
        if(inherits(data, "list")){
                p <- p + facet_wrap(facets = "Comparisons.ID", nrow = 1)
        }


        if(is.null(plot.save.to)){
                p
        }else{
                ggsave(filename = plot.save.to,
                       plot = p,
                       dpi = 300,
                       units = "in",
                       device='png')
        }

        data.is.list <- inherits(data, "list")

        if(data.is.list) {
                validate.comp.names(comp.names, data)
        }

        return(p)
        })
        })
}

# make a warning in geneset.type to say that row names must be gene names;

Try the RVA package in your browser

Any scripts or data that you put into this service are public.

RVA documentation built on Nov. 2, 2021, 1:06 a.m.