#' Add the transcript version
#'
#' Append the transcript version to the identifier (e.g. ENST00000000233.10).
#'
#' @note Updated 2019-09-05.
#' @noRd
.addTxVersion <- function(object) {
message("Adding version to transcript identifiers.")
mcolnames <- colnames(mcols(object))
assert(
is(object, "GRanges"),
identical(metadata(object)[["level"]], "transcripts"),
isSubset("transcriptID", mcolnames)
)
if (isSubset("transcriptIDVersion", mcolnames)) {
## `makeGRangesFromEnsembl()` output via ensembldb.
id <- mcols(object)[["transcriptIDVersion"]]
} else if (isSubset("transcriptVersion", mcolnames)) {
## `makeGRangesFromGFF()` output.
id <- mcols(object)[["transcriptID"]]
version <- mcols(object)[["transcriptVersion"]]
id <- Rle(paste(id, version, sep = "."))
} else {
stop("Failed to locate transcript version metadata.") # nolint
}
mcols(object)[["transcriptID"]] <- id
## Note that names are set by `.makeGRanges()`.
object
}
## Note that this intentionally prioritizes transcripts over genes.
.detectGRangesIDs <- function(object) {
if (is(object, "GRangesList")) {
object <- object[[1L]]
}
assert(is(object, "GRanges"))
mcolnames <- colnames(mcols(object))
if ("transcriptID" %in% mcolnames) {
"transcriptID"
} else if ("transcript_id" %in% mcolnames) {
"transcript_id"
} else if ("geneID" %in% mcolnames) {
"geneID"
} else if ("gene_id" %in% mcolnames) {
"gene_id"
} else {
stop("Failed to detect ID column.")
}
}
## This is the main GRanges final return generator, used by
## `makeGRangesFromEnsembl()` and `makeGRangesFromGFF()`.
.makeGRanges <- function(object, ignoreTxVersion = TRUE) {
assert(
is(object, "GRanges"),
hasNames(object),
hasLength(object),
isFlag(ignoreTxVersion)
)
## Stash object length to ensure no dropping is occurring.
length <- length(object)
## Minimize the object, removing unnecessary metadata columns.
object <- .minimizeGRanges(object)
assert(identical(length(object), length))
## Now we're ready to standardize the metadata naming conventions.
object <- .standardizeGRanges(object)
assert(identical(length(object), length))
## Prepare the metadata.
## Slot organism into metadata.
object <- .slotOrganism(object)
## Ensure object contains prototype metadata.
metadata(object) <- c(.prototypeMetadata, metadata(object))
## Ensure the transcript identifier includes the version, if desired. This
## is useful for tx2gene handling with tximport.
if (identical(ignoreTxVersion, FALSE)) {
object <- .addTxVersion(object)
}
idCol <- .detectGRangesIDs(object)
assert(isSubset(idCol, colnames(mcols(object))))
names <- as.character(mcols(object)[[idCol]])
assert(!any(is.na(names)))
## Inform the user if the object contains invalid names, showing offenders.
invalid <- setdiff(names, make.names(names, unique = TRUE))
if (hasLength(invalid)) {
invalid <- sort(unique(invalid))
message(sprintf(
"%d invalid %s: %s.",
length(invalid),
ngettext(
n = length(invalid),
msg1 = "name",
msg2 = "names"
),
toString(invalid, width = 100L)
))
}
rm(invalid)
## Split into GRangesList if object contains multiple ranges per feature.
if (hasDuplicates(names)) {
message(sprintf(
fmt = paste0(
"GRanges contains multiple ranges per '%s'.\n",
"Splitting into GRangesList."
),
idCol
))
## Metadata will get dropped during `split()` call; stash and reassign.
metadata <- metadata(object)
object <- split(x = object, f = as.factor(names))
metadata(object) <- metadata
rm(metadata)
} else {
names(object) <- names
}
## Ensure the ranges are sorted by gene identifier.
object <- object[sort(names(object))]
## Inform the user about the number of features returned.
level <- match.arg(
arg = metadata(object)[["level"]],
choices = c("genes", "transcripts")
)
message(sprintf(
"%d %s detected.",
length(object),
ngettext(
n = length(object),
msg1 = substr(level, 1L, nchar(level) - 1L), # gene
msg2 = level # genes
)
))
object
}
## Remove uninformative metadata columns from GFF3 before return.
## Always remove columns beginning with a capital letter.
## - Ensembl: Alias, ID, Name, Parent
## - GENCODE: ID, Parent
## - RefSeq: Dbxref, Gap, ID, Name, Note, Parent, Target
.minimizeGFF3 <- function(object) {
assert(is(object, "GRanges"))
mcols <- mcols(object)
mcolnames <- colnames(mcols)
## Remove all columns beginning with a capital letter.
keep <- !grepl("^[A-Z]", mcolnames)
mcolnames <- mcolnames[keep]
## Remove additional blacklisted columns.
blacklist <- "biotype"
mcolnames <- setdiff(mcolnames, blacklist)
## Subset the metadata columns.
mcols <- mcols[, mcolnames, drop = FALSE]
mcols(object) <- mcols
object
}
## This step drops extra columns in `mcols()` and applies run-length encoding,
## to reduce memory overhead.
##
## Note that `removeNA()` call currently will error on complex columns.
## For example, this will error on `CharacterList` columns returned from
## GENCODE GFF3 file.
.minimizeGRanges <- function(object) {
assert(is(object, "GRanges"))
mcols <- mcols(object)
## Drop any complex S4 columns that aren't either atomic or list.
keep <- bapply(
X = mcols,
FUN = function(x) {
is.atomic(x) || is.list(x)
}
)
mcols <- mcols[, keep, drop = FALSE]
## Ensure NA values are properly set, prior to `removeNA()` call.
mcols <- sanitizeNA(mcols)
## Remove columns that are all `NA`. This step will remove all
## transcript-level columns from gene-level ranges.
mcols <- removeNA(mcols)
## Apply run-length encoding on all atomic columns.
mcols <- lapply(
X = mcols,
FUN = function(x) {
if (isS4(x) || is(x, "AsIs") || !is.atomic(x)) {
## `I()` inhibits reinterpretation and returns `AsIs` class.
## This keeps complex columns (e.g. Entrez list) intact.
## Recommended in the `DataFrame` documentation.
I(x)
} else {
## Ensure factor levels get correctly reset, to save memory.
if (is.factor(x)) {
x <- droplevels(x)
}
## Use S4 run length encoding (Rle) for atomic metadata columns.
## Many of these elements are repetitive, and this makes
## operations faster.
Rle(x)
}
}
)
## `lapply()` returns as list, so we need to coerce back to DataFrame.
mcols <- as(mcols, "DataFrame")
mcols(object) <- mcols
object
}
## Standardize the GRanges into desired conventions used in basejump. Note that
## this step makes GRanges imported via `rtracklayer::import()` incompatible
## with `GenomicFeatures::makeTxDbFromGRanges()`.
.standardizeGRanges <- function(object) {
assert(is(object, "GRanges"))
## Standardize the metadata columns.
mcols <- mcols(object)
## Use `transcript` prefix instead of `tx` consistently.
colnames(mcols) <- gsub(
pattern = "^tx_",
replacement = "transcript_",
x = colnames(mcols)
)
## Ensure "ID" is always capitalized (e.g. "entrezid").
colnames(mcols) <- gsub(
pattern = "(.+)id$",
replacement = "\\1ID",
x = colnames(mcols)
)
## Always return using camel case, even though GFF/GTF files use snake.
colnames(mcols) <- camelCase(colnames(mcols))
## Always use `geneName` instead of `symbol`.
## Note that ensembldb output duplicates these.
if (all(c("geneName", "symbol") %in% colnames(mcols))) {
mcols[["symbol"]] <- NULL
} else if ("symbol" %in% colnames(mcols)) {
message("Renaming 'symbol' to 'geneName' in 'mcols()'.")
mcols[["geneName"]] <- mcols[["symbol"]]
mcols[["symbol"]] <- NULL
}
## Re-slot updated mcols back into object before calculating broad class
## biotype and/or assigning names.
mcols(object) <- mcols
## Ensure broad class definitions are included, using run-length encoding.
## Don't pass `mcols` to `.broadClass()`, use the GRanges instead because
## we need the corresponding seqnames for calculations.
mcols(object)[["broadClass"]] <- Rle(.broadClass(object))
## Finally, sort the metadata columns alphabetically.
mcols(object) <-
mcols(object)[, sort(colnames(mcols(object))), drop = FALSE]
## Ensure the ranges are sorted by identifier.
idCol <- .detectGRangesIDs(object)
message(sprintf("Arranging by '%s'.", idCol))
names(object) <- mcols(object)[[idCol]]
object <- object[sort(names(object))]
object
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.