R/plot_cutoff.R

Defines functions plot_cutoff

Documented in plot_cutoff

#' @title Check number of DE genes at different cutoff combinations
#' @description This function processes summary statistics table generated by differential expression analysis
#' like \code{limma} or \code{DESeq2} to evaluate the number of differntially expressed genes with different FDR and
#' fold change cutoff.
#'
#' @param data Summary statistics table or a list of summary statistics tables from limma or DEseq2, where each row is a gene.
#' @param comp.names A character vector that contains the comparison names which correspond to the same order as \code{data}.
#' @param FCflag The column name of the log2FC in the summary statistics table. Default = "logFC".
#' @param FDRflag The column name of the False Discovery Rate (FDR) in the summary statistics table. Default = "adj.P.Val".
#' @param FCmin The minimum starting fold change cutoff to be checked, so the minimum fold change cutoff to be evaluated
#' will be FCmin + FCstep, FCmin default = 1.
#' @param FCmax The maximum fold change cutoff to be checked, default = 2.
#' @param FCstep The step from the minimum to maximum fold change cutoff, one step increase at a time, default = 0.01.
#' @param p.min The minimum starting FDR cutoff to be checked, so the minimum fold change cutoff to be evaluated
#' will be p.min + p.step, p.min default = 0.
#' @param p.max The maximum FDR cutoff to be checked, default = 0.2.
#' @param p.step The step from the minimum to maximum fold change cutoff, one step increase at a time, default = 0.005.
#' @param plot.save.to The address where to save the plot from simplified cutoff combination with FDR of 0.01, 0.05, 0.1, and 0.2.
#' @param gen.3d.plot Whether generate a 3d plotly object to visualize the result, only applys to single dataframe input, default = F.
#' @param gen.plot Whether generate a plot to visualize the result, default = T.


#'
#'
#' @return If the input \code{data} is a data list, then a multi-facet ggplot plot object which contains each
#' of the summary statistics table will be returned; otherwise, if the input \code{data} is a data frame, then the function will return a list which contains 3 elements:
#' \item{df.sub}{A dataframe, which contains the number of genes(3rd column) with FDR (1st column), Fold Change (2nd column)}
#' \item{plot3d}{A plotly object to show the 3d illustration of all possible cutoff selectiosn and
#' the number of DE genes in the 3d surface}
#' \item{gp}{A ggplot object to show the simplified cutoff combination result}
#'
#'
#' @importFrom dplyr %>% filter as_tibble mutate bind_rows
#' @importFrom ggplot2 ggsave ggplot geom_bar geom_text aes theme_minimal labs position_dodge element_text scale_fill_viridis_d facet_grid
#' @importFrom haven as_factor
#' @importFrom purrr map2 set_names map
#' @importFrom plotly plot_ly add_markers layout
#' @importFrom rlang .data
#'
#' @export plot_cutoff
#'
#' @references Xingpeng Li & Olya Besedina, RVA - RNAseq Visualization Automation tool.
#'
#' @details The function takes the summary statistics and returns a list which contains 3 objects: a table which describes the number
#' of DE genes with different cutoff combinations of FDR and fold change, a ggplot object which depicts a simplified version of
#' cutoff selection combination, and a plotly 3d visulization object which depicts a high resolution of cutoff combinations.
#' The default range of the fold change is from 1 to 2, and p value is from 0 to 0.2, with the step of 0.01
#' for FC and 0.005 for FDR.
#'
#'
#' @examples
#' plot_cutoff(Sample_summary_statistics_table)
#'
#'plot_cutoff(data = list(Sample_summary_statistics_table, Sample_summary_statistics_table1),
#'            comp.names = c("A", "B"))
#'
#'
#'

plot_cutoff <- function(data = data,
                        comp.names = NULL,
                        FCflag = "logFC",
                        FDRflag = "adj.P.Val",
                        FCmin = 1.2,
                        FCmax = 2,
                        FCstep = 0.1,
                        p.min = 0,
                        p.max = 0.2,
                        p.step = 0.01,
                        plot.save.to = NULL,
                        gen.3d.plot=TRUE,
                        gen.plot=TRUE){


  suppressWarnings({
    suppressMessages({

      validate.FC(FCmin, FCmax, FCstep)
      validate.pvals(p.min, p.max, p.step)

      plot.save.exists <- !is.null(plot.save.to)

      FCs <- seq(from = FCmin, to = FCmax, by = FCstep)

      if(inherits(data, "list")) {
        validate.comp.names(comp.names, data)

        message(paste0("The p.step parameters are",
                     " ignored with list inputs for simplified output."))

        pvalues <- c(0.01, 0.05, 0.1, 0.2)

        df <- map(data, plot_cutoff_single,
                  FCflag, FDRflag,
                  FCs,
                  pvalues) %>%
          set_names(comp.names) %>%
          bind_rows(, .id = "Comparisons.ID") %>%
          mutate(Comparisons.ID = factor(.data$Comparisons.ID,levels = comp.names))
      } else {
        pvalues <- seq(from = p.min, to = p.max, by = p.step)
        df <- plot_cutoff_single(data,
                                 FCflag,
                                 FDRflag,
                                 FCs,
                                 pvalues)
      }


      if(inherits(data, "list")) {
        gp <- get.cutoff.ggplot(df, FCflag, FDRflag)
        gp <- gp + facet_grid(Comparisons.ID ~ .)
        if(gen.3d.plot) {
          warning(paste0("Although gen.3d.plot = T, no 3D plots are",
                         " generated when the input is a list. To",
                         " create 3D plots for a list of dataframes,",
                         " run plot_cutoff on each dataframe separately.")
          )
        }

        if(!gen.plot) {
          return(df)
        }

        if(plot.save.exists) {
          ggsave(filename = plot.save.to,
                 plot = gp,
                 dpi = 300,
                 units = "in",
                 device='png')

        }
        return(gp)
      }

      #following only run when input is a single dataframe
      produce.cutoff.message(data,
                             FCmin, FCmax, FCstep,
                             FDRflag, p.min, p.max, p.step)

      plot3d <- NULL
      if(gen.3d.plot){
        plot3d <- make.cutoff.plotly(df)
      }

      if(gen.plot) {
        df.sub <- df %>%
          filter(.data$FC %in% FCs, .data$pvalue %in% c(0.01, 0.05, 0.1, 0.2))
        gp <- get.cutoff.ggplot(df.sub, FCflag, FDRflag)

        if(!plot.save.exists) {
          warning(paste0("Plot file name not specified, a plot in",
                         " ggplot object will be output to the second",
                         " object of the return list!")
          )

        } else {
          ggsave(filename = plot.save.to,
                 plot = gp,
                 dpi = 300,
                 units = "in",
                 device='png')
        }

      }

      return(list(df, plot3d, gp))
    })
  })
}

#' @title Create plotly object for number of DE genes at different cutoff combinations
#' @description This function processes summary statistics table generated by differential expression analysis
#' like \code{limma} or \code{DESeq2} to produce an interactibe visual object
#' which depicts the number of differntially expressed genes with different FDR and
#' fold change cutoff.
#'
#' @param df Summary statistics table from limma or DEseq2, where each row is a gene.
#'
#' @importFrom dplyr %>% filter as_tibble mutate bind_rows
#' @importFrom plotly plot_ly add_markers layout
#' @importFrom rlang .data
#'

make.cutoff.plotly <- function(df) {
  df0 <- df %>% mutate(FC = as.numeric(as.character(.data$FC)),
                       pvalue = as.numeric(as.character(.data$pvalue)))
  plot_ly(data = df0,
          x = ~FC,
          y = ~pvalue,
          z = ~Number_of_Genes,
          opacity = 0.5,
          type = "scatter3d",
          mode = "markers",
          color = ~Number_of_Genes,
          size = 1) %>%
    add_markers() %>%
    layout(scene = list(xaxis = list(title = 'Fold Change'),
                        yaxis = list(title = 'FDR'),
                        zaxis = list(title = 'Number of DE genes')))
}

#' @title Create plotly object for number of DE genes at different cutoff combinations
#' @description This function processes summary statistics table generated by differential expression analysis
#' like \code{limma} or \code{DESeq2} and produces a table which contains gene counts
#'  for each of the pvalue and FC combination
#'
#' @param datin Summary statistics table from limma or DEseq2, where each row is a gene.
#' @param pvalues A set of p-values for FDR cutoff to be checked.
#' @param FCs A set of fold change cutoff to be checked.
#' @param FCflag The column name of the log2FC in the summary statistics table.
#' @param FDRflag The column name of the False Discovery Rate (FDR) in the summary statistics table.
#'

plot_cutoff_single <- function(datin,
                               FCflag,
                               FDRflag,
                               FCs,
                               pvalues) {

  validate.stats(datin, FCflag, FDRflag)

  produce.cutoff.warning(datin, FDRflag)

  df <- get.cutoff.df(datin, pvalues, FCs, FCflag, FDRflag)

  return(df)
}

#' @title Create ggplot object for number of differntially expressed genes with
#' different FDR and fold change cutoff.
#' @description This function processes dataframe from plot_cutoff_single function
#' and produces a ggplot object which depicts the number of differntially expressed
#' genes with different FDR and fold change cutoff.
#'
#' @param df Dataframe from plot_cutoff_single.
#' @param FCflag The column name of the log2FC in the summary statistics table.
#' @param FDRflag The column name of the False Discovery Rate (FDR) in the summary statistics table.
#' @importFrom ggplot2 ggsave ggplot geom_bar geom_text aes theme_minimal labs position_dodge element_text scale_fill_viridis_d facet_grid

get.cutoff.ggplot <- function(df, FCflag, FDRflag) {

  ggplot(df, aes(x=.data$FC, y=.data$Number_of_Genes, fill=.data$pvalue)) +
    geom_bar(stat="identity", position=position_dodge()) +
    geom_text(aes(label=.data$Number_of_Genes), vjust=-0.3, color="black",
              position = position_dodge(0.9), size=3.5) +
    labs(x = FCflag, fill = FDRflag) +
    ylab("Number of Genes under cutoff") +
    xlab("FC") +
    labs(fill = 'FDR') +
    theme(strip.text.y = element_text(angle = 0)) +
    scale_fill_viridis_d()

}

#a simpler version to output in result in dataframe

#' @title Create ggplot object for number of differntially expressed genes with
#' different FDR and fold change cutoff.
#' @description This function processes dataframe from plot_cutoff_single function
#' and produces a ggplot object which depicts the number of differntially expressed
#' genes with different FDR and fold change cutoff.
#'
#' @param datin Dataframe from plot_cutoff_single.
#' @param FCflag The column name of the log2FC in the summary statistics table.
#' @param FDRflag The column name of the False Discovery Rate (FDR) in the summary statistics table.
#' @param pvalues A set of p-values for FDR cutoff to be checked.
#' @param FCs A set of fold change cutoff to be checked.
#'
#' @importFrom dplyr %>% filter as_tibble mutate bind_rows
#' @importFrom haven as_factor
#' @importFrom purrr map2 set_names map
#' @importFrom rlang .data
#' @importFrom stats complete.cases

get.cutoff.df <- function(datin,
                          pvalues,
                          FCs,
                          FCflag = "logFC",
                          FDRflag = "adj.P.Val") {

  df <- expand.grid(list(pvalues, FCs)) %>%
    as_tibble %>%
    stats::setNames(c("pvalue","FC"))

  datin <- datin[,c(FDRflag,FCflag)] %>% stats::setNames(c("DAT_FDR","DAT_FC"))
  datin <- datin[complete.cases(datin), ] #only plot complete cases in the result

  get.gene.num <- function(FC, pvalue, FCflag, FDRflag){
    num.pass <- datin %>%
      filter(.data$DAT_FDR <= pvalue , abs(.data$DAT_FC) >= log2(FC)) %>%
      nrow
    return(num.pass)
  }

  df$Number_of_Genes <- unlist(map2(df$FC, df$pvalue, get.gene.num))

  df <- df %>%
    mutate(pvalue = as.factor(.data$pvalue),
           FC = as.factor(.data$FC),
           Number_of_Genes = as.numeric(.data$Number_of_Genes))
  return(df)
}

#' @title Create a warning about pvalue or FDR minimum value
#' @description This function processes summary statistics table generated by differential expression analysis
#' like \code{limma} or \code{DESeq2} and produces a warning about pvalue or FDR minimum value
#'
#' @param data Summary statistics table from limma or DEseq2, where each row is a gene.
#' @param FDRflag The column name of the False Discovery Rate (FDR) in the summary statistics table.

produce.cutoff.warning <- function(data,
                                   FDRflag) {

  if(min(data[,FDRflag]) > 0.05){
    warning(paste0("This summary statistics table has a minimum",
                   " pvalue or FDR of ", min(data[,FDRflag]),
                   ", it is likely there are no DE genes in this",
                   " summary statistics table."))
    return(TRUE)
  }
  FALSE
}

#' @title Create a message about fold change and pvalues used to produce the plot.
#' @description This function processes summary statistics table generated by differential expression analysis
#' like \code{limma} or \code{DESeq2} and produces a message about pvalues and fold change used.
#'
#' @param data Summary statistics table from limma or DEseq2, where each row is a gene.
#' @param FDRflag The column name of the False Discovery Rate (FDR) in the summary statistics table.
#' @param FCmin The minimum starting fold change cutoff to be checked, so the minimum fold change cutoff to be evaluated
#' will be FCmin + FCstep, FCmin default = 1.
#' @param FCmax The maximum fold change cutoff to be checked, default = 2.
#' @param FCstep The step from the minimum to maximum fold change cutoff, one step increase at a time, default = 0.01.
#' @param p.min The minimum starting FDR cutoff to be checked, so the minimum fold change cutoff to be evaluated
#' will be p.min + p.step, p.min default = 0.
#' @param p.max The maximum FDR cutoff to be checked, default = 0.2.
#' @param p.step The step from the minimum to maximum fold change cutoff, one step increase at a time, default = 0.005.

produce.cutoff.message <- function(data,
                                   FCmin, FCmax, FCstep,
                                   FDRflag, p.min, p.max, p.step) {

  if(!produce.cutoff.warning(data, FDRflag)) {
    message(paste0("With the fold change from ",FCmin," to ",FCmax,
               " using a step of ", FCstep," and with ", FDRflag,
               " values ranging from ",p.min," to ",p.max,
               " with a step of ", p.step, ", \nIn total,",
               ((FCmax - FCmin)/FCstep)*((p.max-p.min)/p.step),
               " cutoff combinations can be visualized in  the 3d plot.\n"))
  }

}

Try the RVA package in your browser

Any scripts or data that you put into this service are public.

RVA documentation built on Nov. 2, 2021, 1:06 a.m.