#!/usr/bin/env Rscript
#
# Copyright 2013 - 2022 Michael K. Schuster
#
# Biomedical Sequencing Facility (BSF), part of the genomics core facility of
# the Research Center for Molecular Medicine (CeMM) of the Austrian Academy of
# Sciences and the Medical University of Vienna (MUW).
#
#
# This file is part of BSF R.
#
# BSF R is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# BSF R is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with BSF R. If not, see <http://www.gnu.org/licenses/>.
# Description -------------------------------------------------------------
# BSF R script to post-processes Cufflinks output by enriching gene and
# transcript tables with Ensembl annotation, either downloaded from BioMart or
# imported from a reference GTF file.
# Option Parsing ----------------------------------------------------------
suppressPackageStartupMessages(expr = library(package = "optparse"))
argument_list <-
optparse::parse_args(object = optparse::OptionParser(
option_list = list(
optparse::make_option(
opt_str = "--verbose",
action = "store_true",
default = TRUE,
help = "Print extra output [default]",
type = "logical"
),
optparse::make_option(
opt_str = "--quiet",
action = "store_false",
default = FALSE,
dest = "verbose",
help = "Print little output",
type = "logical"
),
optparse::make_option(
opt_str = "--gtf-reference",
default = NULL,
dest = "gtf_reference",
help = "GTF file specifying a reference transcriptome",
type = "character"
),
optparse::make_option(
opt_str = "--genome-version",
default = NULL,
dest = "genome_version",
help = "Genome version",
type = "character"
),
optparse::make_option(
opt_str = "--biomart-instance",
default = "ENSEMBL_MART_ENSEMBL",
dest = "biomart_instance",
help = "BioMart instance [ENSEMBL_MART_ENSEMBL]",
type = "character"
),
optparse::make_option(
opt_str = "--biomart-data-set",
dest = "biomart_data_set",
help = "BioMart data set",
type = "character"
),
optparse::make_option(
opt_str = "--biomart-host",
dest = "biomart_host",
default = "www.ensembl.org",
help = "BioMart host [www.ensembl.org]",
type = "character"
),
optparse::make_option(
opt_str = "--biomart-port",
default = 80L,
dest = "biomart_port",
help = "BioMart port [80]",
type = "integer"
),
optparse::make_option(
opt_str = "--biomart-path",
default = "/biomart/martservice",
dest = "biomart_path",
help = "BioMart path [/biomart/martservice]",
type = "character"
),
optparse::make_option(
opt_str = "--pattern-path",
default = "/rnaseq_cufflinks_.*$",
dest = "pattern_path",
help = "Cufflinks directory path pattern [/rnaseq_cufflinks_.*$]",
type = "character"
),
optparse::make_option(
opt_str = "--pattern-sample",
default = ".*/rnaseq_cufflinks_(.*)$",
dest = "pattern_sample",
help = "Cufflinks sample name pattern [.*/rnaseq_cufflinks_(.*)$]",
type = "character"
),
optparse::make_option(
opt_str = "--genome-directory",
default = ".",
dest = "genome_directory",
help = "Genome directory path [.]",
type = "character"
),
optparse::make_option(
opt_str = "--output-directory",
default = ".",
dest = "output_directory",
help = "Output directory path [.]",
type = "character"
),
optparse::make_option(
opt_str = "--plot-width",
default = 7.0,
dest = "plot_width",
help = "Plot width in inches [7.0]",
type = "double"
),
optparse::make_option(
opt_str = "--plot-height",
default = 7.0,
dest = "plot_height",
help = "Plot height in inches [7.0]",
type = "double"
)
)
))
if (is.null(x = argument_list$biomart_data_set)) {
# If a --biomart-data-set was not specified, ...
if (is.null(x = argument_list$gtf_reference)) {
# ... a --gtf-reference option must be specified.
stop("Missing --gtf-reference or --biomart-data-set option")
}
}
# Library Import ----------------------------------------------------------
# CRAN r-lib
suppressPackageStartupMessages(expr = library(package = "sessioninfo"))
# CRAN Tidyverse
suppressPackageStartupMessages(expr = library(package = "dplyr"))
suppressPackageStartupMessages(expr = library(package = "readr"))
suppressPackageStartupMessages(expr = library(package = "tibble"))
# Save plots in the following formats.
graphics_formats <- c("pdf" = "pdf", "png" = "png")
gene_annotation_tibble <- NULL
isoform_annotation_tibble <- NULL
if (is.null(x = argument_list$biomart_data_set)) {
# Import GTF file-based annotation --------------------------------------
if (is.null(x = argument_list$gtf_reference)) {
stop("Missing --gtf-reference option")
}
# If a --genome-version option was not provided, set it to NA.
if (is.null(x = argument_list$genome_version)) {
argument_list$genome_version <- NA_character_
}
suppressPackageStartupMessages(expr = library(package = "Biostrings"))
suppressPackageStartupMessages(expr = library(package = "rtracklayer"))
message("Import GTF annotation file")
reference_granges <- rtracklayer::import(
con = argument_list$gtf_reference,
format = "gtf",
genome = argument_list$genome_version,
feature.type = "transcript"
)
reference_mcols <- S4Vectors::mcols(x = reference_granges)
# Standard GTF attributes
gene_annotation_tibble <- tibble::tibble(
"ensembl_gene_id" = reference_mcols$gene_id,
"ensembl_gene_version" = reference_mcols$gene_version,
"ensembl_gene_name" = reference_mcols$gene_name,
"ensembl_gene_source" = reference_mcols$gene_source,
"ensembl_gene_biotype" = reference_mcols$gene_biotype
# "havana_gene" = if ("havana_gene" %in% S4Vectors::colnames(x = reference_mcols))
# reference_mcols$havana_gene
# else
# NA_character_,
# "havana_gene_version" = if ("havana_gene_version" %in% S4Vectors::colnames(x = reference_mcols))
# reference_mcols$havana_gene_version
# else
# NA_character_
)
# Optional GTF attributes
# Havana annotation is only avaliable for earlier Ensembl releases.
if ("havana_gene" %in% S4Vectors::colnames(x = reference_mcols)) {
gene_annotation_tibble$havana_gene <-
reference_mcols$havana_gene
}
if ("havana_gene_version" %in% S4Vectors::colnames(x = reference_mcols)) {
gene_annotation_tibble$havana_gene_version <-
reference_mcols$havana_gene_version
}
# Gene information is available for each transcript.
gene_annotation_tibble <-
dplyr::distinct(.data = gene_annotation_tibble)
# Standard GTF attributes
isoform_annotation_tibble <- tibble::tibble(
"ensembl_transcript_id" = reference_mcols$transcript_id,
"ensembl_transcript_version" = reference_mcols$transcript_version,
"ensembl_transcript_name" = reference_mcols$transcript_name,
"ensembl_transcript_source" = reference_mcols$transcript_source,
"ensembl_transcript_biotype" = reference_mcols$transcript_biotype,
"ensembl_gene_id" = reference_mcols$gene_id,
"ensembl_gene_version" = reference_mcols$gene_version,
"ensembl_gene_name" = reference_mcols$gene_name,
"ensembl_gene_source" = reference_mcols$gene_source,
"ensembl_gene_biotype" = reference_mcols$gene_biotype,
"ccds_id" = if ("ccds_id" %in% S4Vectors::colnames(x = reference_mcols))
reference_mcols$ccds_id
else
NA_character_,
"tag" = if ("tag" %in% S4Vectors::colnames(x = reference_mcols))
reference_mcols$tag
else
NA_character_
# "havana_transcript" = if ("havana_transcript" %in% S4Vectors::colnames(x = reference_mcols))
# reference_mcols$havana_transcript
# else
# NA_character_,
# "havana_transcript_version" = if ("havana_transcript_version" %in% S4Vectors::colnames(x = reference_mcols))
# reference_mcols$havana_transcript_version
# else
# NA_character_,
# "havana_transcript_support_level" = if ("havana_transcript_support_level" %in% S4Vectors::colnames(x = reference_mcols))
# reference_mcols$havana_transcript_support_level
# else
# NA_character_,
# "havana_gene" = if ("havana_gene" %in% S4Vectors::colnames(x = reference_mcols))
# reference_mcols$havana_gene
# else
# NA_character_,
# "havana_gene_version" = if ("havana_gene_version" %in% S4Vectors::colnames(x = reference_mcols))
# reference_mcols$havana_gene_version
# else
# NA_character_
)
# Optional GTF attributes
# if ("ccds_id" %in% S4Vectors::colnames(x = reference_mcols)) {
# isoform_annotation_tibble$ccds_id <-
# reference_mcols$ccds_id
# }
#
# if ("tag" %in% S4Vectors::colnames(x = reference_mcols)) {
# isoform_annotation_tibble$tag <-
# reference_mcols$tag
# }
#
# Havana annotation is only available for earlier Ensembl releases.
if ("havana_transcript" %in% S4Vectors::colnames(x = reference_mcols)) {
isoform_annotation_tibble$havana_transcript <-
reference_mcols$havana_transcript
}
if ("havana_transcript_version" %in% S4Vectors::colnames(x = reference_mcols)) {
isoform_annotation_tibble$havana_transcript_version <-
reference_mcols$havana_transcript_version
}
if ("havana_transcript_support_level" %in% S4Vectors::colnames(x = reference_mcols)) {
isoform_annotation_tibble$havana_transcript_support_level <-
reference_mcols$havana_transcript_support_level
}
if ("havana_gene" %in% S4Vectors::colnames(x = reference_mcols)) {
isoform_annotation_tibble$havana_gene <-
reference_mcols$havana_gene
}
if ("havana_gene_version" %in% S4Vectors::colnames(x = reference_mcols)) {
isoform_annotation_tibble$havana_gene_version <-
reference_mcols$havana_gene_version
}
rm(reference_mcols, reference_granges)
} else {
# Import BioMart-based annotation ---------------------------------------
suppressPackageStartupMessages(expr = library(package = "biomaRt"))
# Connect to the Ensembl BioMart.
message("Connect to BioMart")
ensembl_mart <- biomaRt::useMart(
biomart = argument_list$biomart_instance,
dataset = argument_list$biomart_data_set,
host = argument_list$biomart_host,
path = argument_list$biomart_path,
port = argument_list$biomart_port
)
message("Loading attribute data from BioMart")
ensembl_attributes <- biomaRt::listAttributes(
mart = ensembl_mart,
page = "feature_page",
what = c("name", "description", "page")
)
# Get Ensembl Gene information. From Ensembl version e75, the attributes
# "external_gene_id" and "external_gene_db" are called "external_gene_name" and
# "external_gene_source", respectively.
if ("external_gene_id" %in% ensembl_attributes$name) {
# Pre e75.
ensembl_gene_attributes <- c("ensembl_gene_id",
"external_gene_id",
"external_gene_db",
"gene_biotype")
} else if ("external_gene_name" %in% ensembl_attributes$name) {
# Post e75.
ensembl_gene_attributes <- c("ensembl_gene_id",
"external_gene_name",
"external_gene_db",
"gene_biotype")
} else {
stop(
"Neither external_gene_id nor external_gene_name are available as BioMart attributes."
)
}
message("Loading gene data from BioMart")
gene_annotation_tibble <-
tibble::as_tibble(x = biomaRt::getBM(attributes = ensembl_gene_attributes, mart = ensembl_mart))
rm(ensembl_gene_attributes)
# Get Ensembl Transcript information.
if ("external_gene_id" %in% ensembl_attributes$name) {
# Pre e75.
ensembl_transcript_attributes <- c(
"ensembl_transcript_id",
"external_transcript_id",
"transcript_db_name",
"transcript_biotype",
"ensembl_gene_id",
"external_gene_id",
"external_gene_db",
"gene_biotype"
)
} else if ("external_gene_name" %in% ensembl_attributes$name) {
# Post e75.
ensembl_transcript_attributes <- c(
"ensembl_transcript_id",
"external_transcript_name",
"transcript_db_name",
"transcript_biotype",
"ensembl_gene_id",
"external_gene_name",
"external_gene_db",
"gene_biotype"
)
} else {
stop(
"Neither external_gene_id nor external_gene_name are available as BioMart attributes."
)
}
message("Loading transcript data from BioMart")
isoform_annotation_tibble <-
tibble::as_tibble(x = biomaRt::getBM(attributes = ensembl_transcript_attributes, mart = ensembl_mart))
rm(ensembl_transcript_attributes)
# Destroy and thus disconnect the Ensembl BioMart connection already here.
rm(ensembl_mart, ensembl_attributes)
}
# Process all "rnaseq_cufflinks_*" directories in the current working directory.
message("Processing Cufflinks reports for sample:")
sample_tibble <- tibble::tibble(
# R character vector of directory paths.
"directory_path" = grep(
pattern = argument_list$pattern_path,
x = list.dirs(
path = argument_list$genome_directory,
full.names = TRUE,
recursive = FALSE
),
value = TRUE
),
# R character vector of sample names.
"sample_name" = base::gsub(
pattern = argument_list$pattern_sample,
replacement = "\\1",
x = .data$directory_path
)
)
# Add further variables for all FPKM_status bio types and states.
for (biotype in c("genes", "isoforms")) {
for (fpkm_status in c("OK", "LOWDATA", "HIDATA", "FAIL")) {
sample_tibble[, paste("FPKM_status", biotype, fpkm_status, sep = "_")] <-
0L
}
rm(fpkm_status)
}
rm(biotype)
for (i in seq_len(length.out = base::nrow(x = sample_tibble))) {
message(" ", sample_tibble$sample_name[i])
# Construct sample-specific prefixes for Cufflinks and Tophat directories.
prefix_cufflinks <-
paste("rnaseq", "cufflinks", sample_tibble$sample_name[i], sep = "_")
output_directory <-
file.path(argument_list$output_directory, prefix_cufflinks)
if (!file.exists(output_directory)) {
dir.create(path = output_directory,
showWarnings = TRUE,
recursive = FALSE)
}
# Read, summarise, merge and write gene (genes.fpkm_tracking) tables.
tracking_tibble <-
dplyr::group_by(
.data = readr::read_tsv(
file = file.path(
argument_list$genome_directory,
prefix_cufflinks,
"genes.fpkm_tracking"
),
col_names = TRUE,
col_types = readr::cols(
"tracking_id" = readr::col_character(),
# Not populated for genes and isoforms.
"class_code" = readr::col_factor(),
# Not populated for genes and isoforms.
"nearest_ref_id" = readr::col_character(),
"gene_id" = readr::col_character(),
# Not populated for genes and isoforms.
"gene_short_name" = readr::col_character(),
# Not populated for genes and isoforms.
"tss_id" = readr::col_character(),
"locus" = readr::col_character(),
# Not populated (i.e. "-") for genes, read as factor.
"length" = readr::col_factor(),
# Not populated (i.e. "-") for genes, read as factor.
"coverage" = readr::col_factor(),
"FPKM" = readr::col_double(),
"FPKM_conf_lo" = readr::col_double(),
"FPKM_conf_hi" = readr::col_double(),
"FPKM_status" = readr::col_factor()
)
),
.data$FPKM_status
)
# Summarise the FPKM_status variable and drop the grouping in the summary tibble.
fpkm_status_tibble <-
dplyr::summarise(
.data = tracking_tibble,
"FPKM_status_count" = dplyr::n(),
.groups = "drop"
)
fpkm_status_tibble <-
dplyr::mutate(
.data = fpkm_status_tibble,
"FPKM_status_variable" = paste("FPKM_status_genes", .data$FPKM_status, sep = "_")
)
for (j in seq_along(along.with = fpkm_status_tibble$FPKM_status_count)) {
sample_tibble[i, fpkm_status_tibble$FPKM_status_variable[j]] <-
fpkm_status_tibble$FPKM_status_count[j]
}
rm(j, fpkm_status_tibble)
readr::write_tsv(
x = dplyr::left_join(
x = gene_annotation_tibble,
y = tracking_tibble,
by = c("ensembl_gene_id" = "tracking_id")
),
file = file.path(
output_directory,
paste(prefix_cufflinks, "genes_fpkm_tracking.tsv", sep = "_")
)
)
rm(tracking_tibble)
# Read, summarize, merge and write transcript (isoforms.fpkm_tracking) tables.
tracking_tibble <-
dplyr::group_by(
.data = readr::read_tsv(
file = file.path(
argument_list$genome_directory,
prefix_cufflinks,
"isoforms.fpkm_tracking"
),
col_names = TRUE,
col_types = readr::cols(
"tracking_id" = readr::col_character(),
# Not populated for genes and isoforms.
"class_code" = readr::col_factor(),
# Not populated for genes and isoforms.
"nearest_ref_id" = readr::col_character(),
"gene_id" = readr::col_character(),
# Not populated for genes and isoforms.
"gene_short_name" = readr::col_character(),
# Not populated for genes and isoforms.
"tss_id" = readr::col_character(),
"locus" = readr::col_character(),
# Not populated for genes.
"length" = readr::col_integer(),
# Not populated for genes.
"coverage" = readr::col_number(),
"FPKM" = readr::col_double(),
"FPKM_conf_lo" = readr::col_double(),
"FPKM_conf_hi" = readr::col_double(),
"FPKM_status" = readr::col_factor()
)
),
.data$FPKM_status
)
# Summarise the FPKM_status variable and drop the grouping in the summary tibble.
fpkm_status_tibble <-
dplyr::summarise(
.data = tracking_tibble,
"FPKM_status_count" = dplyr::n(),
.groups = "drop"
)
fpkm_status_tibble <-
dplyr::mutate(
.data = fpkm_status_tibble,
"FPKM_status_variable" = paste("FPKM_status_isoforms", .data$FPKM_status, sep = "_")
)
for (j in seq_along(along.with = fpkm_status_tibble$FPKM_status_count)) {
sample_tibble[i, fpkm_status_tibble$FPKM_status_variable[j]] <-
fpkm_status_tibble$FPKM_status_count[j]
}
rm(j, fpkm_status_tibble)
readr::write_tsv(
x = dplyr::left_join(
x = isoform_annotation_tibble,
y = tracking_tibble,
by = c("ensembl_transcript_id" = "tracking_id")
),
file = file.path(
output_directory,
paste(prefix_cufflinks, "isoforms_fpkm_tracking.tsv", sep = "_")
)
)
rm(tracking_tibble,
output_directory,
prefix_cufflinks)
}
rm(i)
readr::write_tsv(
x = sample_tibble,
file = file.path(
argument_list$output_directory,
"rnaseq_cufflinks_summary.tsv"
)
)
rm(
sample_tibble,
isoform_annotation_tibble,
gene_annotation_tibble,
graphics_formats,
argument_list
)
message("All done")
# Finally, print all objects that have not been removed from the environment.
if (length(x = ls())) {
print(x = ls())
}
print(x = sessioninfo::session_info())
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.