bsfR: BSF R package

#!/usr/bin/env Rscript
#
# Copyright 2013 - 2022 Michael K. Schuster
#
# Biomedical Sequencing Facility (BSF), part of the genomics core facility
# of the Research Center for Molecular Medicine (CeMM) of the
# Austrian Academy of Sciences and the Medical University of Vienna (MUW).
#
#
# This file is part of BSF R.
#
# BSF R is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# BSF R is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with BSF R.  If not, see <http://www.gnu.org/licenses/>.

# Description -------------------------------------------------------------


# This script uses the Bioconductor ChIPpeakAnno package to annotate the
# consensus peak set, as well as contrast peak sets of a differential binding
# analysis carried out by the Bioconductor DiffBind package via the
# bsf_chipseq_diff_bind.R script. The annotation is based on a Biocoductor TxDb
# object, as well as its corresponding GTF file that provides additional
# annotation, such as gene symbols and gene biotypes.

# Option Parsing ----------------------------------------------------------


suppressPackageStartupMessages(expr = library(package = "optparse"))

argument_list <-
  optparse::parse_args(object = optparse::OptionParser(
    option_list = list(
      optparse::make_option(
        opt_str = "--verbose",
        action = "store_true",
        default = TRUE,
        help = "Print extra output [default]",
        type = "logical"
      ),
      optparse::make_option(
        opt_str = "--quiet",
        action = "store_false",
        default = FALSE,
        dest = "verbose",
        help = "Print little output",
        type = "logical"
      ),
      optparse::make_option(
        opt_str = "--comparison",
        dest = "comparison",
        help = "Comparison name",
        type = "character"
      ),
      optparse::make_option(
        opt_str = "--factor",
        dest = "factor",
        help = "ChIP factor",
        type = "character"
      ),
      optparse::make_option(
        opt_str = "--fdr-threshold",
        default = 0.05,
        dest = "fdr_threshold",
        help = "FDR threshold [0.05]",
        type = "double"
      ),
      optparse::make_option(
        opt_str = "--gtf-reference",
        default = NULL,
        dest = "gtf_reference",
        help = "Reference transcriptome GTF file path",
        type = "character"
      ),
      optparse::make_option(
        opt_str = "--txdb-path",
        dest = "txdb_path",
        help = "Reference transcriptome Bioconductor TxDb file path",
        type = "character"
      ),
      optparse::make_option(
        opt_str = "--genome-directory",
        default = ".",
        dest = "genome_directory",
        help = "Genome directory path [.]",
        type = "character"
      ),
      optparse::make_option(
        opt_str = "--output-directory",
        default = ".",
        dest = "output_directory",
        help = "Output directory path [.]",
        type = "character"
      ),
      optparse::make_option(
        opt_str = "--plot-width",
        default = 7.0,
        dest = "plot_width",
        help = "Plot width in inches [7.0]",
        type = "double"
      ),
      optparse::make_option(
        opt_str = "--plot-height",
        default = 7.0,
        dest = "plot_height",
        help = "Plot height in inches [7.0]",
        type = "double"
      )
    )
  ))

# Library Import ----------------------------------------------------------


# CRAN r-lib
suppressPackageStartupMessages(expr = library(package = "sessioninfo"))
# CRAN Tidyverse
suppressPackageStartupMessages(expr = library(package = "ggplot2"))
suppressPackageStartupMessages(expr = library(package = "readr"))
suppressPackageStartupMessages(expr = library(package = "tibble"))
# Bioconductor
suppressPackageStartupMessages(expr = library(package = "AnnotationDbi"))
suppressPackageStartupMessages(expr = library(package = "BiocVersion"))
suppressPackageStartupMessages(expr = library(package = "Biostrings"))
suppressPackageStartupMessages(expr = library(package = "ChIPpeakAnno"))
suppressPackageStartupMessages(expr = library(package = "DiffBind"))
suppressPackageStartupMessages(expr = library(package = "GenomicFeatures"))
suppressPackageStartupMessages(expr = library(package = "rtracklayer"))

# Save plots in the following formats.

graphics_formats <- c("pdf" = "pdf", "png" = "png")

# Define the precedence of regions for the
# ChIPpeakAnno::assignChromosomeRegion() function so that each peak is assigned
# only once and all regions sum up to 100%.
region_precedence <- c("Promoters",
                       "immediateDownstream",
                       "fiveUTRs",
                       "threeUTRs",
                       "Exons",
                       "Introns")

prefix <-
  paste("chipseq",
        "diff",
        "bind",
        argument_list$comparison,
        argument_list$factor,
        sep = "_")

output_directory <-
  file.path(argument_list$output_directory, prefix)

if (!file.exists(output_directory)) {
  dir.create(path = output_directory,
             showWarnings = TRUE,
             recursive = FALSE)
}

# Load a TxDb object ------------------------------------------------------


message("Loading a TxDb object")
txdb_object <- AnnotationDbi::loadDb(file = argument_list$txdb_path)

# Initialise a DBA object -------------------------------------------------


diffbind_dba <- NULL
file_path <-
  file.path(argument_list$genome_directory,
            prefix,
            paste0(prefix, '_DBA.RData'))
if (file.exists(file_path) &&
    file.info(file_path)$size > 0L) {
  message("Loading a DiffBind DBA object")
  diffbind_dba <-
    DiffBind::dba.load(
      dir = file.path(argument_list$genome_directory, prefix),
      pre = paste0(prefix, "_")
    )
} else {
  stop("Require a pre-calculated DiffBind DBA object in file: ",
       file_path)
}
rm(file_path)

# Get the consensus peak set ----------------------------------------------


message("Loading a DiffBind peakset (GRanges) object")
diffbind_peakset_granges <-
  DiffBind::dba.peakset(
    DBA = diffbind_dba,
    bRetrieve = TRUE,
    DataType = DiffBind::DBA_DATA_GRANGES
  )

# Assign chromosome regions -----------------------------------------------


message("Assigning chromosome regions: ",
        length(diffbind_peakset_granges))
chromosome_region_list <-
  ChIPpeakAnno::assignChromosomeRegion(peaks.RD = diffbind_peakset_granges,
                                       precedence = region_precedence,
                                       TxDb = txdb_object)

message("Plotting chromosome regions")
plot_paths <- file.path(output_directory,
                        paste(
                          paste(prefix,
                                "peak",
                                "set",
                                "regions",
                                sep = "_"),
                          graphics_formats,
                          sep = "."
                        ))

ggplot_object <-
  ggplot2::ggplot(data = tibble::tibble(
    name = dimnames(x = chromosome_region_list$percentage)$subjectHits,
    percentage = as.double(x = chromosome_region_list$percentage)
  ))

ggplot_object <-
  ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(x = .data$name, y = .data$percentage))

ggplot_object <-
  ggplot_object + ggplot2::labs(
    x = "Name",
    y = "Percentage",
    title = "Chromosome Regions",
    subtitle = "Combined Peak Set"
  )

ggplot_object <-
  ggplot_object + ggplot2::theme(axis.text.x = ggplot2::element_text(
    size = ggplot2::rel(x = 0.8),
    hjust = 0.0,
    vjust = 0.5,
    angle = 90.0
  ))

for (plot_path in plot_paths) {
  ggplot2::ggsave(
    filename = plot_path,
    plot = ggplot_object,
    width = argument_list$plot_width,
    height = argument_list$plot_height,
    limitsize = FALSE
  )
}

# Write the chromosome region fractions to a TSV file.
readr::write_tsv(
  x = ggplot_object$data,
  file = file.path(output_directory,
                   paste(
                     paste(prefix,
                           "peak",
                           "set",
                           "regions",
                           sep = "_"),
                     "tsv",
                     sep = "."
                   )),
  col_names = TRUE
)
rm(plot_path, ggplot_object, plot_paths, chromosome_region_list)

# Initialise reference annotation -----------------------------------------


message("Reading reference GTF gene features")
gene_granges <-
  rtracklayer::import(
    con = argument_list$gtf_reference,
    format = "GTF",
    # genome = ,
    feature.type = "gene"
  )

message("Creating annotation frame")
gene_frame <-
  S4Vectors::as.data.frame(x = GenomicRanges::mcols(x = gene_granges)[, c("gene_id",
                                                                          "gene_version",
                                                                          "gene_name",
                                                                          "gene_biotype",
                                                                          "gene_source")])

# Add the gene location as an Ensembl-like location, lacking the coordinate
# system name and version.
gene_frame$gene_location <-
  as.character(x = gene_granges)

rm(gene_granges)

utils::write.table(
  x = gene_frame,
  file = file.path(output_directory,
                   paste(
                     paste(prefix, "gene", "set", sep = "_"), "tsv", sep = "."
                   )),
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)

# Annotate the consensus peak set -----------------------------------------


message("Preparing annotation")
# Annotation based on a TxDb object.
# The MultiDb::OrganimsDb object would be used to provide gene symbols, but a
# matching version with regards to the assembly and the annotation is not always
# available. Hence, information is joined from the meta annotation columns of
# the GTF-based GenomicRanges::GRanges object.

annotation_granges <-
  ChIPpeakAnno::toGRanges(data = txdb_object, feature = "gene")

message("Annotating the consensus peak set: ",
        length(x = diffbind_peakset_granges))
annotated_granges <-
  ChIPpeakAnno::annotatePeakInBatch(myPeakList = diffbind_peakset_granges,
                                    AnnotationData = annotation_granges)

annotated_frame <-
  GenomicRanges::as.data.frame(x = annotated_granges, stringsAsFactors = FALSE)
rm(annotated_granges)

# Merge with the GTF meta annotation columns.
merged_frame <-
  base::merge.data.frame(x = annotated_frame,
                         y = gene_frame,
                         by.x = "feature",
                         by.y = "gene_id")
rm(annotated_frame)

# Order by the double "peak" variable.
merged_frame <-
  merged_frame[base::order(as.double(x = merged_frame$peak)),]

utils::write.table(
  x = merged_frame,
  file = file.path(output_directory,
                   paste(
                     paste(prefix,
                           "peak",
                           "set",
                           "genes",
                           "complete",
                           sep = "_"),
                     "tsv",
                     sep = "."
                   )),
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)
rm(merged_frame)
# print(x = "Consensus peak set annotation warnings:")
# print(x = base::warnings())

#' Process a contrasts data frame obtained via DiffBind::dba.show() per row.
#'
#' @param contrast contrast frame row name indicating the contrast number
#' @param group1 A \code{character} scalar of contrast group 1
#' @param group2 A \code{character} scalar of contrast group 2
#' @param db_number An \code{integer} scalar with the number of differentially
#'   bound sites
#'
#' @return TRUE
#' @export
#'
#' @examples
process_per_contrast <-
  function(contrast, group1, group2, db_number) {
    message(
      sprintf(
        fmt = "Processing %s %s contrast %s versus %s",
        argument_list$comparison,
        argument_list$factor,
        group1,
        group2
      )
    )

    report_granges <- NULL
    base::tryCatch(
      expr = {
        report_granges <- DiffBind::dba.report(
          DBA = diffbind_dba,
          contrast = as.integer(x = contrast),
          th = 1.0,
          bNormalized = TRUE,
          bCalled = TRUE,
          bCounts = TRUE,
          bCalledDetail = TRUE,
          DataType = DiffBind::DBA_DATA_GRANGES
        )
      },
      error = function(cond) {
        message(
          "DiffBind::dba.report failed ",
          sprintf(
            "for comparison %s, factor %s and contrast %s versus %s ",
            argument_list$comparison,
            argument_list$factor,
            group1,
            group2
          ),
          "with message:\n",
          cond,
          appendLF = TRUE
        )
      }
    )

    message(
      sprintf(
        fmt = "  Annotating %s %s contrast %s versus %s peak set: %d",
        argument_list$comparison,
        argument_list$factor,
        group1,
        group2,
        length(x = report_granges)
      )
    )

    annotated_granges <-
      ChIPpeakAnno::annotatePeakInBatch(myPeakList = report_granges,
                                        AnnotationData = annotation_granges)
    # print(x = "Contrast peak set annotation warnings:")
    # print(x = base::warnings())

    annotated_frame <-
      GenomicRanges::as.data.frame(x = annotated_granges, stringsAsFactors = FALSE)
    rm(annotated_granges)

    merged_frame <-
      base::merge.data.frame(
        x = annotated_frame,
        y = gene_frame,
        by.x = "feature",
        by.y = "gene_id"
      )
    rm(annotated_frame)

    # Calculate ranks for ...
    # (1) the effect size (Fold), ...
    merged_frame$rank_fold_change <-
      base::rank(x = -abs(x = merged_frame$Fold),
                 ties.method = c("min"))

    # (2) the absolute level (Conc) and ...
    merged_frame$rank_conc <-
      base::rank(x = -merged_frame$Conc,
                 ties.method = c("min"))

    # (3) the statistical significance (padj).
    merged_frame$rank_fdr <-
      base::rank(x = merged_frame$FDR, ties.method = c("min"))

    # Order by the double "peak" variable.
    merged_frame <-
      merged_frame[base::order(as.double(x = merged_frame$peak)),]

    utils::write.table(
      x = merged_frame,
      file = file.path(
        output_directory,
        sprintf(
          "%s_peaks_%s__%s_genes_complete.tsv",
          prefix,
          group1,
          group2
        )
      ),
      sep = "\t",
      row.names = FALSE,
      col.names = TRUE
    )

    # Filter by the FDR threshold value.
    merged_frame <-
      merged_frame[merged_frame$FDR <= argument_list$fdr_threshold,]

    utils::write.table(
      x = merged_frame,
      file = file.path(
        output_directory,
        sprintf(
          "%s_peaks_%s__%s_genes_significant.tsv",
          prefix,
          group1,
          group2
        )
      ),
      sep = "\t",
      row.names = FALSE,
      col.names = TRUE
    )

    rm(merged_frame)

    significant_granges <-
      report_granges[report_granges$FDR <= argument_list$fdr_threshold,]
    message(sprintf(fmt = "  Assigning chromosome regions for significant peak set: %d",
                    length(x = significant_granges)))

    chromosome_region_list <-
      ChIPpeakAnno::assignChromosomeRegion(peaks.RD = significant_granges,
                                           precedence = region_precedence,
                                           TxDb = txdb_object)

    if (is.null(x = dimnames(x = chromosome_region_list$percentage)$subjectHits)) {
      warning("  Skipping an empty chromosome regions plot")
    } else {
      message("  Creating a chromosome regions plot")
      plot_paths <- file.path(
        output_directory,
        sprintf(
          "%s_peaks_%s__%s_regions.%s",
          prefix,
          group1,
          group2,
          graphics_formats
        )
      )

      ggplot_object <-
        ggplot2::ggplot(data = tibble::tibble(
          name = dimnames(x = chromosome_region_list$percentage)$subjectHits,
          percentage = as.double(x = chromosome_region_list$percentage)
        ))

      ggplot_object <-
        ggplot_object + ggplot2::geom_point(mapping = ggplot2::aes(x = .data$name, y = .data$percentage))

      ggplot_object <-
        ggplot_object + ggplot2::labs(
          x = "Name",
          y = "Percentage",
          title = "Chromosome Regions",
          subtitle = paste("Peak Set", group1, group2, sep = " ")
        )

      ggplot_object <-
        ggplot_object + ggplot2::theme(
          axis.text.x = ggplot2::element_text(
            size = ggplot2::rel(x = 0.8),
            hjust = 0.0,
            vjust = 0.5,
            angle = 90.0
          )
        )

      for (plot_path in plot_paths) {
        ggplot2::ggsave(
          filename = plot_path,
          plot = ggplot_object,
          width = argument_list$plot_width,
          height = argument_list$plot_height,
          limitsize = FALSE
        )
      }
      # Write the fractions to a TSV file.
      readr::write_tsv(
        x = ggplot_object$data,
        file = file.path(
          output_directory,
          sprintf("%s_peaks_%s__%s_regions.tsv",
                  prefix,
                  group1,
                  group2)
        ),
        col_names = TRUE
      )
      rm(plot_path,
         ggplot_object,
         plot_paths)
    }
    rm(chromosome_region_list,
       significant_granges,
       report_granges)
  }

# Get a data frame with all contrasts to apply the above function to each row.
contrast_frame <-
  DiffBind::dba.show(DBA = diffbind_dba, bContrasts = TRUE)

# Replace '!' characters with 'not_'.

# DiffBind3 seems to use "Group", while DiffBind2 used "Group1".
group1_name <-
  if ("Group" %in% base::names(x = contrast_frame)) {
    "Group"
  } else {
    "Group1"
  }

contrast_frame[, group1_name] <-
  base::gsub(pattern = "!",
             replacement = "not_",
             x = contrast_frame[, group1_name])

contrast_frame$Group2 <-
  base::gsub(pattern = "!",
             replacement = "not_",
             x = contrast_frame$Group2)

return_value <-
  base::mapply(
    FUN = process_per_contrast,
    base::row.names(contrast_frame),
    contrast_frame[, group1_name],
    contrast_frame$Group2,
    # Since column 5 (DB.DESeq2) is a factor, it needs converting into a
    # character, before converting into an integer.
    as.integer(x = as.character(x = contrast_frame[, 5L]))
  )
rm(return_value,
   contrast_frame,
   group1_name,
   process_per_contrast)

rm(
  annotation_granges,
  gene_frame,
  diffbind_peakset_granges,
  diffbind_dba,
  txdb_object,
  output_directory,
  prefix,
  region_precedence,
  graphics_formats,
  argument_list
)

print(x = "Final warnings:")
print(x = base::warnings())

message("All done")

# Finally, print all objects that have not been removed from the environment.
if (length(x = ls())) {
  print(x = ls())
}

print(x = sessioninfo::session_info())