bsfR: BSF R package

#!/usr/bin/env Rscript
#
# Copyright 2013 - 2022 Michael K. Schuster
#
# Biomedical Sequencing Facility (BSF), part of the genomics core facility of
# the Research Center for Molecular Medicine (CeMM) of the Austrian Academy of
# Sciences and the Medical University of Vienna (MUW).
#
#
# This file is part of BSF R.
#
# BSF R is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# (at your option) any later version.
#
# BSF R is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with BSF R.  If not, see <http://www.gnu.org/licenses/>.


# Description -------------------------------------------------------------


# BSF R Script to import a GenomicRanges::GRanges object.
#
# The --drop-variables option allows for specifying metadata variables
# that should be dropped from the GenomicRanges::GRanges object.
#
# Since Ensembl provides multiple "tag" attributes per GTF line, the value of
# only one of them can be included in the meta data DataFrame. Since a single
# "tag" value renders the annotation misleading, the "tag" variable can be
# dropped.
#
# Please note that Ensembl provides a singe tag attribute in GFF3 files with one
# or more comma-separated tag values, which can be parsed into
# IRanges::CharacterList objects.
#
# Output file:
# - <output_directory>/<transcriptome_version>_granges.rds
# - <output_directory>/<transcriptome_version>.bed

# Option Parsing ----------------------------------------------------------


suppressPackageStartupMessages(expr = library(package = "optparse"))

argument_list <-
  optparse::parse_args(object = optparse::OptionParser(
    option_list = list(
      optparse::make_option(
        opt_str = "--verbose",
        action = "store_true",
        default = TRUE,
        help = "Print extra output [default]",
        type = "logical"
      ),
      optparse::make_option(
        opt_str = "--quiet",
        action = "store_false",
        default = FALSE,
        dest = "verbose",
        help = "Print little output",
        type = "logical"
      ),
      optparse::make_option(
        opt_str = "--transcriptome-path",
        dest = "transcriptome_path",
        help = "Transcriptome (GTF) path",
        type = "character"
      ),
      optparse::make_option(
        opt_str = "--transcriptome-version",
        dest = "transcriptome_version",
        help = "Transcriptome version",
        type = "character"
      ),
      optparse::make_option(
        opt_str = "--seqinfo-path",
        dest = "seqinfo_path",
        help = "GenomeInfoDb::Seqinfo path (preferred over --genome-version)",
        type = "character"
      ),
      optparse::make_option(
        opt_str = "--genome-version",
        dest = "genome_version",
        help = "Genome version (--seqinfo-path is preferred)",
        type = "character"
      ),
      optparse::make_option(
        opt_str = "--drop-variables",
        dest = "drop_variables",
        help = "A comma-separated list of meta data variables to drop",
        type = "character"
      ),
      optparse::make_option(
        opt_str = "--complete-hierarchy",
        action = "store_true",
        default = TRUE,
        dest = "complete_hierarchy",
        help = "Complete the exon, transcript, gene feature hierarchy [TRUE]",
        type = "logical"
      ),
      optparse::make_option(
        opt_str = "--incomplete-hierarchy",
        action = "store_false",
        default = FALSE,
        dest = "complete_hierarchy",
        help = "Leave only exon features [FALSE]",
        type = "logical"
      ),
      optparse::make_option(
        opt_str = "--output-directory",
        default = ".",
        dest = "output_directory",
        help = "Output directory path [.]",
        type = "character"
      )
    )
  ))

if (is.null(x = argument_list$transcriptome_path)) {
  stop("Missing --transcriptome-path option")
}

if (is.null(x = argument_list$transcriptome_version)) {
  stop("Missing --transcriptome-version option")
}

if (is.null(x = argument_list$genome_version) &&
    is.null(x = argument_list$seqinfo_path)) {
  stop(
    "Either the --genome-version or the --seqinfo-path option ",
    "is required for output file naming."
  )
}

# Library Import ----------------------------------------------------------


# CRAN r-lib
suppressPackageStartupMessages(expr = library(package = "sessioninfo"))
# CRAN Tidyverse
suppressPackageStartupMessages(expr = library(package = "readr"))
suppressPackageStartupMessages(expr = library(package = "stringr"))
# Bioconductor
suppressPackageStartupMessages(expr = library(package = "BiocVersion"))
suppressPackageStartupMessages(expr = library(package = "rtracklayer"))

seqinfo_object <-
  if (is.null(x = argument_list$seqinfo_path)) {
    GenomeInfoDb::Seqinfo(genome = argument_list$genome_version)
  } else {
    readr::read_rds(file = argument_list$seqinfo_path)
  }

if (is.null(x = seqinfo_object)) {
  stop("Could not retrieve a GenomeInfoDb::Seqinfo object")
}

granges_object <-
  rtracklayer::import(con = argument_list$transcriptome_path,
                      genome = seqinfo_object)

# Drop metadata variables if requested and present.
if (!is.null(argument_list$drop_variables)) {
  # Split on "," characters.
  drop_variables_character <-
    stringr::str_split_1(string = argument_list$drop_variables,
                         pattern = stringr::fixed(pattern = ","))

  # Trim remaining white space around "," characters.
  drop_variables_character <-
    stringr::str_trim(string = drop_variables_character)

  # Only subset if the variable is not just a single empty string.
  if ((length(x = drop_variables_character) > 1) ||
      (drop_variables_character[1L] != "")) {
    if (argument_list$verbose) {
      message("Removing metadata variables: ",
              paste(drop_variables_character, collapse = " "))
    }

    mcols_dframe <- S4Vectors::mcols(x = granges_object)

    mcols_dframe <- S4Vectors::subset(
      x = mcols_dframe,
      select = base::setdiff(x = S4Vectors::colnames(x = mcols_dframe),
                             y = drop_variables_character),
      drop = FALSE
    )

    S4Vectors::mcols(x = granges_object) <- mcols_dframe
    rm(mcols_dframe)
  }
  rm(drop_variables_character)
}

if (argument_list$complete_hierarchy) {
  # Complete the exon -> transcript -> gene hierarchy, by adding missing
  # transcript or gene features.
  mcols_dframe <- S4Vectors::mcols(x = granges_object)
  if ("type" %in% S4Vectors::colnames(x = mcols_dframe)) {
    # Check if the GTF file has only "exon" features. If so, create also
    # "transcript" and "gene" features.
    if (sum(mcols_dframe$type == "exon") == S4Vectors::nrow(x = mcols_dframe)) {
      # Only "exon records are available.
      transcript_granges <-
        granges_object

      transcript_mcols_dframe <-
        S4Vectors::mcols(x = transcript_granges)

      transcript_mcols_dframe$type <- "transcript"

      if ("exon_id" %in% S4Vectors::colnames(x = transcript_mcols_dframe)) {
        transcript_mcols_dframe$exon_id <- NA_character_
      }

      S4Vectors::mcols(x = transcript_granges) <-
        transcript_mcols_dframe
      rm(transcript_mcols_dframe)

      gene_granges <- transcript_granges

      gene_mcols_dframe <- S4Vectors::mcols(x = gene_granges)

      gene_mcols_dframe$type <- "gene"

      if ("transcript_id" %in% S4Vectors::colnames(x = gene_mcols_dframe)) {
        gene_mcols_dframe$transcript_id <- NA_character_
      }

      S4Vectors::mcols(x = gene_granges) <- gene_mcols_dframe
      rm(gene_mcols_dframe)

      # Merge "exon", "transcript" and "gene" GRanges.
      granges_object <-
        c(granges_object, transcript_granges, gene_granges)

      rm(transcript_granges, gene_granges)

      granges_object <- GenomicRanges::sort(x = granges_object)
    }
  }
  rm(mcols_dframe)
}

# Add names to the GenomicRanges::GRanges object.

mcols_dframe <- S4Vectors::mcols(x = granges_object)
if ("type" %in% S4Vectors::colnames(x = mcols_dframe)) {
  row_names <- base::names(x = granges_object)

  if (is.null(x = row_names)) {
    row_names <- S4Vectors::rownames(x = mcols_dframe)
  }

  if (is.null(x = row_names)) {
    row_names <-
      as.character(x = base::seq_len(length.out = base::length(x = granges_object)))
  }

  # Since exon features are not unique, because the same exon can be part of
  # more than one transcript, row names can not be assigned in a simple loop
  # over feature types.
  if ("gene" %in% mcols_dframe$type &&
      "gene_id" %in% S4Vectors::colnames(x = mcols_dframe)) {
    row_index <- mcols_dframe$type == "gene"
    row_names[row_index] <-
      mcols_dframe[row_index, "gene_id", drop = TRUE]
    rm(row_index)
  }

  if ("transcript" %in% mcols_dframe$type &&
      "transcript_id" %in% S4Vectors::colnames(x = mcols_dframe)) {
    row_index <- mcols_dframe$type == "transcript"
    row_names[row_index] <-
      mcols_dframe[row_index, "transcript_id", drop = TRUE]
    rm (row_index)

    if ("exon" %in% mcols_dframe$type &&
        "exon_id" %in% S4Vectors::colnames(x = mcols_dframe)) {
      row_index <- mcols_dframe$type == "exon"
      row_names[row_index] <-
        paste(mcols_dframe[row_index, "transcript_id", drop = TRUE],
              mcols_dframe[row_index, "exon_id", drop = TRUE],
              sep = "_")
      rm(row_index)
    }
  }

  # Assign to the DataFrame object.
  S4Vectors::rownames(x = mcols_dframe) <- row_names

  # Assign to the GRanges object.
  S4Vectors::mcols(x = granges_object) <- mcols_dframe

  base::names(x = granges_object) <- row_names

  rm(row_names)
}
rm(mcols_dframe)

readr::write_rds(
  x = granges_object,
  file = file.path(argument_list$output_directory,
                   paste(
                     paste(argument_list$transcriptome_version, "granges", sep = "_"),
                     "rds",
                     sep = "."
                   )),
  compress = "gz"
)

# Re-export the merged GenomicRanges::GRanges objects in GTF format.
# Since the rtracklayer::export() function checks for a single genome, multiple
# ones need collapsing.
merged_seqinfo_object <- GenomicRanges::seqinfo(x = granges_object)

GenomeInfoDb::genome(x = merged_seqinfo_object) <-
  paste(base::unique(x = GenomeInfoDb::genome(x = merged_seqinfo_object)), collapse = "_")

GenomicRanges::seqinfo(x = granges_object) <- merged_seqinfo_object

rm(merged_seqinfo_object)

rtracklayer::export(object = granges_object,
                    con = file.path(
                      argument_list$output_directory,
                      paste(argument_list$transcriptome_version,
                            "gtf",
                            sep = ".")
                    ))

rm(granges_object, seqinfo_object, argument_list)

message("All done")

# Finally, print all objects that have not been removed from the environment.
if (length(x = ls())) {
  print(x = "Remaining objects:")
  print(x = ls())
}

print(x = sessioninfo::session_info())