R/qqplot.R

Defines functions gg_qqplot qqplot2 qqplot

Documented in gg_qqplot qqplot qqplot2

#' Log QQ p-value plot
#' 
#' Fast function for generating a log quantile-quantile (QQ) p-value plot
#' 
#' @param pval A vector of p-values
#' @param fdr An optional vector of FDR values to save time if previously
#'   computed. If not supplied, these will be calculated using [p.adjust()]
#'   using the Benjamini-Hochberg method.
#' @param fdr_cutoff Cutoff for FDR significance
#' @param scheme Vector of 2 colours for plotting non-significant and
#'   significant SNPs
#' @param npoints Limits the number of non-significant points being plotted to
#'   speed up plotting. See details. Set to `NULL` to plot all points.
#' @param show_plot Logical whether to produce a plot via base graphics or just
#'   return dataframe ready for plotting.
#' @param verbose Whether to show messages
#' @param ... Optional plotting arguments passed to [qqplot2()]
#' @details
#' Produces a fast QQ plot. Particularly useful for analyses with very large
#' numbers of p-values (such as eQTL analysis) which can be slow to plot. The
#' function looks first for all comparisons which reached FDR at the designated
#' cut-off and ensures all of these points are plotted. Additional points which
#' typically overlap substantially near the origin are thinned by random
#' sampling. In this way the plot can be reduced from millions of points to
#' 500,000 points with a plot which is indistinguishable from one with all
#' points plotted. For comparison, set `npoints` to `NULL` to plot all points as
#' usual.
#' 
#' Calling [qqplot()] will result in a base graphics plot. The plotting
#' dataframe is returned invisibly, so users can save time when refining plots
#' by saving the dataframe produced by [qqplot()] and then invoking [qqplot2()]
#' to simply plot the points. Users who prefer ggplot2 can also pass the
#' dataframe generated by [qqplot()] to [gg_qqplot()].
#' @return Generates a plot using base graphics. Also returns a dataframe
#'   invisibly which can be used for downstream plotting via either [qqplot2()]
#'   or [gg_qqplot()].
#' @seealso [qqplot2()] [gg_qqplot()]
#' @export
#' 
qqplot <- function(pval, fdr = NULL, fdr_cutoff = 0.05,
                   scheme = c("darkgrey", "royalblue"),
                   npoints = 5e5, show_plot = TRUE,
                   verbose = TRUE, ...) {
  start <- Sys.time()
  if (verbose) message("Sorting p-values")
  ord <- order(pval, decreasing = TRUE)
  pval <- pval[ord]
  if (!is.null(fdr)) {
    fdr <- fdr[ord]
  } else fdr <- p.adjust(pval, method = "BH")
  if (verbose) message("Processing")
  fdr <- fdr < fdr_cutoff
  n <- length(pval)
  x <- rev(seq_len(n) / n - 1 / (2 * n))
  x <- -log10(x)
  
  if (!is.null(npoints)) {
    # thin non-significant points
    rem <- npoints - sum(fdr)
    if (rem < 1) rem <- 1e5
    cutpoint <- n - sum(fdr)
    sam <- sample(cutpoint, rem)
    ind <- fdr
    ind[sam] <- TRUE
    pval <- pval[ind]; fdr <- fdr[ind]; x <- x[ind]
  }
  
  df <- data.frame(x, pval, fdr)
  df$logp <- -log10(df$pval)
  df$fdr <- as.factor(df$fdr)
  dur <- as.numeric(Sys.time() - start, units = "secs")
  if (verbose) message("Completed processing (", format(dur, digits = 3), " secs)")
  
  if (show_plot) {
    qqplot2(df, scheme = scheme, ...)
  }
  invisible(df)
}


#' Log QQ p-value plot (2nd stage)
#' 
#' Second stage plotting function which accepts dataframe generated by
#' [qqplot()]. This can be used to avoid repeating computation of the QQ plot
#' values.
#' 
#' @param df A dataframe generated by [qqplot()]
#' @param scheme Vector of 2 colours for plotting non-significant and
#'   significant SNPs
#' @param ... Optional plotting arguments passed to [plot()]
#' @return No return value. Produces a base graphics plot.
#' @importFrom graphics abline
#' @export
#' 
qqplot2 <- function(df, scheme = c("darkgrey", "royalblue"), ...) {
  new.args <- list(...)
  abl <- quote(abline(0, 1, col = "red"))
  plot.args <- list(x = df$x, y = df$logp, col = scheme[df$fdr],
                    pch = 20, las = 1,
                    xlab = expression(-log[10]("expected p-value")),
                    ylab = expression(-log[10]("observed p-value")),
                    bty = "l", mgp = c(1.8, 0.5, 0), tcl = -0.3,
                    panel.first = abl)
  if (length(new.args)) plot.args[names(new.args)] <- new.args
  do.call("plot", plot.args)
}


#' Log QQ p-value plot (ggplot2)
#' 
#' Produces a QQ plot via ggplot2. Requires a dataframe generated by
#' [qqplot()].
#' 
#' @param df A dataframe generated by [qqplot()]
#' @param scheme Vector of 2 colours for plotting non-significant and
#'   significant SNPs
#' @return A ggplot2 graphics plot object
#' @importFrom ggplot2 ggplot aes geom_point theme_classic theme element_text
#'   scale_color_discrete xlab ylab geom_abline
#' @importFrom rlang .data
#' @export
#' 
gg_qqplot <- function(df, scheme = c("darkgrey", "royalblue")) {
  ggplot(df, aes(x = .data$x, y = .data$logp, color = .data$fdr)) +
    geom_abline(intercept = 0, slope = 1, color = "red") +
    geom_point() +
    scale_color_discrete(type = scheme) +
    xlab(expression(-log[10]("expected p-value"))) +
    ylab(expression(-log[10]("observed p-value"))) +
    theme_classic() +
    theme(axis.text = element_text(colour = "black"),
          legend.position = "none")
}

Try the easylabel package in your browser

Any scripts or data that you put into this service are public.

easylabel documentation built on May 29, 2024, noon