chipmine: Analyze ChIP-seq Data

Documented in chip_summary

#' TF ChIPseq data summary plots
#'
#' This method generates various summary plots for TF ChIPseq data. \cr
#' \itemize{
#' \item A table summarizing quantils of peak enrichment, p-value and q-value
#' \item A distribution beeswarm plot for peak enrichment values.
#' \item A distribution beeswarm plot for peak p-value.
#' \item A distribution beeswarm plot for peak width.
#' \item A pie chart showing peak annotation distribution.
#' }
#' If there is no peak information, it returns blank plot with "No Data" text
#'
#' @param sampleId sampleId
#' @param peakAnnotation macs2 peak annotation file generated by \code{narrowPeak_annotate()}
#' @param peakFile narrowPeak or broadPeak file generated by macs2
#' @param peakFormat One of narrowPeak or broadPeak
#' @param pval_cutoff A cutoff for -log10(p-value) for drawing horizontal line in beeswarm
#' plot. Default: 20
#' @param enrichment_cutoff A cutoff for peak enrichment for drawing horizontal line in
#' beeswarm plot. Default: 3
#' @param markTargets A named list of target genes which needs to be annotated in the
#' scatter plot. Default: NULL
#' @param pointColor A named vector with colors for each target category in \code{markTargets}
#' list. Default: NULL
#' @param pointAlpha A named vector of alpha vlues for each target category in
#' \code{markTargets} list. Default: NULL

#'
#' @return A list with following elements is returned.
#' \itemize{
#' \item \strong{data:} plot data used for plotting
#' \item \strong{figure:} A combined figure generated by \code{ggpubr::ggarrange}
#' \item \strong{plots:} Individual ggplot list for each plot.
#' \itemize{
#' \item \strong{table:} a \code{ggtable} object for quantile stats
#' \item \strong{distribution:} A list with two beeswarm plot objects: enrichment, pval.
#' \item \strong{annoPie:} A pie chart for peak annotation distribution
#' }}
#'
#' @importFrom ggpubr annotate_figure ggarrange
#'
#' @export
#'
#' @examples NA
chip_summary <- function(sampleId, peakAnnotation, peakFile, peakFormat,
                         pval_cutoff = 20, enrichment_cutoff = 3,
                         markTargets = NULL, pointColor = NULL, pointAlpha = NULL){

  peakFormat <- match.arg(peakFormat, choices = c("narrowPeak", "broadPeak"))

  if(!is.null(markTargets)){
    if(!is.list(markTargets)) stop("markTargets should be a named list")

    if(!all(names(markTargets) %in% names(pointColor))){
      stop("pointColor should be a named vector with same names as markTargets")
    }

    if(!all(names(markTargets) %in% names(pointAlpha))){
      stop("pointAlpha should be a named vector with same names as markTargets")
    }
  }

  pointColor["peaks"] <- "grey"
  pointAlpha["peaks"] <- 0.5

  markingPreference <- tibble(geneType = c(names(markTargets), "peaks"),
                              drawOrder = 1:(length(markTargets)+1))

  peakMarkDf <- purrr::map_dfr(
    .x = markTargets,
    .f = function(x){tibble(geneId = x, stringsAsFactors = F)},
    .id = "geneType")

  peaksGr <- rtracklayer::import(con = peakFile, format = peakFormat)


  if(file.exists(peakAnnotation)){

    ## import peak data with annotation
    peakAnno <- markPeaks::import_peak_annotation(
      sampleId = sampleId,
      peakAnnoFile = peakAnnotation,
      column_suffix = FALSE
    )

    ## add target gene type information if points need to be colored
    if(!is.null(markTargets)){
      peakAnno <- dplyr::left_join(x = peakAnno, y = peakMarkDf, by = "geneId")

      pointAlpha["peaks"] <- 1

    } else{
      peakAnno <- dplyr::mutate(peakAnno, geneType = "peaks")
    }

    peakAnno <- peakAnno %>%
      tidyr::replace_na(replace = list(geneType = "peaks")) %>%
      dplyr::left_join(y = markingPreference, by = c("geneType" = "geneType"))

    ## for each peak, select one gene. preference is decided by order of names in markTargets list
    plotData <- dplyr::group_by(peakAnno, peakId) %>%
      dplyr::arrange(drawOrder) %>%
      dplyr::slice(1L) %>%
      dplyr::ungroup() %>%
      dplyr::distinct() %>%
      dplyr::mutate(
        sampleId = sampleId,
        peakWidth = peakEnd - peakStart + 1,
        geneType = forcats::fct_relevel(.f = geneType, !!!markingPreference$geneType)
      )

    # plotData$geneType <- factor(
    #   x = plotData$geneType, levels = markingPreference$geneType, ordered = TRUE
    # )


    ## summary table
    summaryTable <- purrr::map_dfr(
      .x = structure(c("peakEnrichment", "peakPval", "peakQval"),
                     names = c("peakEnrichment", "peakPval", "peakQval")),
      .f = function(x){
        as.list(summary(plotData[[x]]))
      },
      .id = "metric")

    gg_stable <- ggpubr::ggtexttable(summaryTable, rows = NULL,
                                     theme = ttheme("mOrange"))


    ## set outliers to 99.5 quantile
    if(nrow(plotData) > 20){
      plotData$peakEnrichment <- pmin(plotData$peakEnrichment, quantile(plotData$peakEnrichment, 0.995))
      plotData$peakPval <- pmin(plotData$peakPval, quantile(plotData$peakPval, 0.99))
      plotData$peakWidth <- pmin(plotData$peakWidth, quantile(plotData$peakWidth, 0.99))
    }

    # quantile(plotData$peakEnrichment,
    #          c(seq(0, 0.9, by = 0.1), 0.95, 0.99, 0.992, 0.995, 0.999, 0.9999, 1), na.rm = T)
    #
    # quantile(plotData$peakPval,
    #          c(seq(0, 0.9, by = 0.1), 0.95, 0.99, 0.992, 0.995, 0.999, 0.9999, 1), na.rm = T)

    theme_scatter <- theme_bw() +
      theme(
        axis.title.x = element_blank(),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 12),
        panel.grid = element_blank()
      )

    ## macs2 fold-enrichment density scatter plot
    gg_dot_enrichment <- ggplot(
      data = plotData,
      mapping = aes(x = !!sampleId, y = peakEnrichment)) +
      geom_hline(yintercept = enrichment_cutoff, color = "black", linetype = "dashed") +
      ggbeeswarm::geom_quasirandom(mapping = aes(color = geneType, alpha = geneType)) +
      geom_boxplot(width=0.1, fill = NA, outlier.colour = NA, color = alpha("black", 0.7)) +
      scale_color_manual(name = "Peak annotations",
                         values = pointColor) +
      scale_alpha_manual(values = pointAlpha) +
      labs(title = "macs2 fold enrichment distribution",
           y = "peak enrichment") +
      guides(alpha = FALSE,
             color = guide_legend(override.aes = list(size = 4))) +
      theme_scatter



    ## macs2 p-value density scatter plot
    gg_dot_pval <- ggplot(
      data = plotData,
      mapping = aes(x = !!sampleId, y = peakPval)) +
      geom_hline(yintercept = pval_cutoff, color = "black", linetype = "dashed") +
      ggbeeswarm::geom_quasirandom(mapping = aes(color = geneType, alpha = geneType)) +
      geom_boxplot(width=0.1, fill = NA, outlier.colour = NA, color = alpha("black", 0.7)) +
      scale_color_manual(name = "Peak annotations",
                         values = pointColor) +
      scale_alpha_manual(values = pointAlpha) +
      labs(title = "macs2 p-value distribution",
           y = "-log10(p-value)") +
      guides(alpha = FALSE,
             color = guide_legend(override.aes = list(size = 4))) +
      theme_scatter


    ## peak width distribution
    gg_dot_width <- ggplot(
      data = plotData,
      mapping = aes(x = !!sampleId, y = peakWidth)) +
      geom_hline(yintercept = 300, color = "black", linetype = "dashed") +
      ggbeeswarm::geom_quasirandom(mapping = aes(color = geneType, alpha = geneType)) +
      geom_boxplot(width=0.1, fill = NA, outlier.colour = NA, color = alpha("black", 0.7)) +
      scale_color_manual(name = "Peak annotations",
                         values = pointColor) +
      scale_alpha_manual(values = pointAlpha) +
      labs(title = "macs2 peak width distribution",
           y = "peak width (bp)") +
      guides(alpha = FALSE,
             color = guide_legend(override.aes = list(size = 4))) +
      theme_scatter

    ## peak annotation pie chart
    peakAnSummary <- dplyr::group_by(peakAnno, peakAnnotation, peakCategory) %>%
      dplyr::summarise(count = n()) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(label = round(count / sum(count), digits = 4)) %>%
      dplyr::mutate(
        peakAnnotation = forcats::fct_relevel(
          peakAnnotation,
          "upstream", "promoter", "5UTR", "tx_start", "include_tx", "EXON",
          "INTRON", "tx_end", "3UTR", "intergenic"
        )
      ) %>%
      dplyr::arrange(desc(peakAnnotation)) %>%
      dplyr::mutate(
        ypos = cumsum(count)- 0.5*count
      )

    peakAnnotationCol <- structure(
      .Data = rainbow(n = length(levels(peakAnSummary$peakAnnotation))),
      names = length(levels(peakAnSummary$peakAnnotation))
    )

    gg_pie_peakAn <- ggplot(
      data = peakAnSummary,
      mapping = aes(x = 1, y = count, fill = peakAnnotation, label = scales::percent(label))
    ) +
      geom_bar(color = "black", stat = "identity") +
      coord_polar(theta = "y", start = 0, direction = -1) +
      geom_label_repel(
        mapping = aes(x = 1.5, y = ypos),
        hjust = 0.5, direction = "x",
        size = 5, show.legend = FALSE) +
      scale_fill_manual(
        name = "Peak annotations",
        values = peakAnnotationCol
      ) +
      scale_x_continuous(limits = c(0, 1.75), expand = expansion(add = c(0, 0))) +
      labs(title = "Peak annotation distribution") +
      theme_bw() +
      theme(
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        # panel.border = element_blank(),
        panel.grid=element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        plot.title=element_text(size=14, face="bold")
      )

    plotList <- list(
      table = gg_stable,
      distribution = list(enrichment = gg_dot_enrichment,
                          pval = gg_dot_pval,
                          width = gg_dot_width),
      annoPie = gg_pie_peakAn
    )

    ## combine plots + table to make a summary figure
    summaryFig <- ggpubr::ggarrange(
      ggpubr::ggarrange(gg_dot_enrichment, gg_dot_pval, gg_dot_width,
                        nrow = 1, ncol = 3, legend = "bottom", common.legend = TRUE),
      ggpubr::ggarrange(gg_stable, gg_pie_peakAn,
                        nrow = 1, ncol = 2, legend = "right", widths = c(1, 1),
                        hjust = 0),
      nrow = 2,
      heights = c(5, 5)
    ) +
      theme(
        plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm")
      )

    summaryFig <- ggpubr::annotate_figure(
      p = summaryFig,
      top = ggpubr::text_grob(
        label = paste(sampleId, "ChIPseq summary (#peaks =",length(peaksGr), ")"),
        size = 16, face = "bold")
    )

  } else{

    ## blank figure
    plotData <- NULL

    summaryFig <- ggpubr::annotate_figure(
      p = ggplot() + geom_text(mapping = aes(x = 0.5, y = 0.5), label = "No data", size = 30) +
        theme_void(),
      top = ggpubr::text_grob(
        label = paste(sampleId, "ChIPseq summary\n#peaks =",length(peaksGr)),
        size = 16, face = "bold")
    )

    plotList <- list(table = NULL,
                     distribution = list(enrichment = NULL, pval = NULL),
                     annoPie = NULL)

  }

  return(list(
    data = plotData,
    figure = summaryFig,
    plots = plotList
  ))

}




##################################################################################