R/makeGRanges-internal.R

Defines functions .addTxVersion .detectGRangesIDs .makeGRanges .minimizeGFF3 .minimizeGRanges .standardizeGRanges

#' Add the transcript version
#'
#' Append the transcript version to the identifier (e.g. ENST00000000233.10).
#'
#' @note Updated 2019-09-05.
#' @noRd
.addTxVersion <- function(object) {
    message("Adding version to transcript identifiers.")
    mcolnames <- colnames(mcols(object))
    assert(
        is(object, "GRanges"),
        identical(metadata(object)[["level"]], "transcripts"),
        isSubset("transcriptID", mcolnames)
    )
    if (isSubset("transcriptIDVersion", mcolnames)) {
        ## `makeGRangesFromEnsembl()` output via ensembldb.
        id <- mcols(object)[["transcriptIDVersion"]]
    } else if (isSubset("transcriptVersion", mcolnames)) {
        ## `makeGRangesFromGFF()` output.
        id <- mcols(object)[["transcriptID"]]
        version <- mcols(object)[["transcriptVersion"]]
        id <- Rle(paste(id, version, sep = "."))
    } else {
        stop("Failed to locate transcript version metadata.")  # nolint
    }
    mcols(object)[["transcriptID"]] <- id
    ## Note that names are set by `.makeGRanges()`.
    object
}



## Note that this intentionally prioritizes transcripts over genes.
.detectGRangesIDs <- function(object) {
    if (is(object, "GRangesList")) {
        object <- object[[1L]]
    }
    assert(is(object, "GRanges"))
    mcolnames <- colnames(mcols(object))
    if ("transcriptID" %in% mcolnames) {
        "transcriptID"
    } else if ("transcript_id" %in% mcolnames) {
        "transcript_id"
    } else if ("geneID" %in% mcolnames) {
        "geneID"
    } else if ("gene_id" %in% mcolnames) {
        "gene_id"
    } else {
        stop("Failed to detect ID column.")
    }
}



## This is the main GRanges final return generator, used by
## `makeGRangesFromEnsembl()` and `makeGRangesFromGFF()`.
.makeGRanges <- function(object, ignoreTxVersion = TRUE) {
    assert(
        is(object, "GRanges"),
        hasNames(object),
        hasLength(object),
        isFlag(ignoreTxVersion)
    )
    ## Stash object length to ensure no dropping is occurring.
    length <- length(object)
    ## Minimize the object, removing unnecessary metadata columns.
    object <- .minimizeGRanges(object)
    assert(identical(length(object), length))
    ## Now we're ready to standardize the metadata naming conventions.
    object <- .standardizeGRanges(object)
    assert(identical(length(object), length))
    ## Prepare the metadata.
    ## Slot organism into metadata.
    object <- .slotOrganism(object)
    ## Ensure object contains prototype metadata.
    metadata(object) <- c(.prototypeMetadata, metadata(object))
    ## Ensure the transcript identifier includes the version, if desired. This
    ## is useful for tx2gene handling with tximport.
    if (identical(ignoreTxVersion, FALSE)) {
        object <- .addTxVersion(object)
    }
    idCol <- .detectGRangesIDs(object)
    assert(isSubset(idCol, colnames(mcols(object))))
    names <- as.character(mcols(object)[[idCol]])
    assert(!any(is.na(names)))
    ## Inform the user if the object contains invalid names, showing offenders.
    invalid <- setdiff(names, make.names(names, unique = TRUE))
    if (hasLength(invalid)) {
        invalid <- sort(unique(invalid))
        message(sprintf(
            "%d invalid %s: %s.",
            length(invalid),
            ngettext(
                n = length(invalid),
                msg1 = "name",
                msg2 = "names"
            ),
            toString(invalid, width = 100L)
        ))
    }
    rm(invalid)
    ## Split into GRangesList if object contains multiple ranges per feature.
    if (hasDuplicates(names)) {
        message(sprintf(
            fmt = paste0(
                "GRanges contains multiple ranges per '%s'.\n",
                "Splitting into GRangesList."
            ),
            idCol
        ))
        ## Metadata will get dropped during `split()` call; stash and reassign.
        metadata <- metadata(object)
        object <- split(x = object, f = as.factor(names))
        metadata(object) <- metadata
        rm(metadata)
    } else {
        names(object) <- names
    }
    ## Ensure the ranges are sorted by gene identifier.
    object <- object[sort(names(object))]
    ## Inform the user about the number of features returned.
    level <- match.arg(
        arg = metadata(object)[["level"]],
        choices = c("genes", "transcripts")
    )
    message(sprintf(
        "%d %s detected.",
        length(object),
        ngettext(
            n = length(object),
            msg1 = substr(level, 1L, nchar(level) - 1L),  # gene
            msg2 = level                                  # genes
        )
    ))
    object
}



## Remove uninformative metadata columns from GFF3 before return.
## Always remove columns beginning with a capital letter.
## - Ensembl: Alias, ID, Name, Parent
## - GENCODE: ID, Parent
## - RefSeq: Dbxref, Gap, ID, Name, Note, Parent, Target
.minimizeGFF3 <- function(object) {
    assert(is(object, "GRanges"))
    mcols <- mcols(object)
    mcolnames <- colnames(mcols)
    ## Remove all columns beginning with a capital letter.
    keep <- !grepl("^[A-Z]", mcolnames)
    mcolnames <- mcolnames[keep]
    ## Remove additional blacklisted columns.
    blacklist <- "biotype"
    mcolnames <- setdiff(mcolnames, blacklist)
    ## Subset the metadata columns.
    mcols <- mcols[, mcolnames, drop = FALSE]
    mcols(object) <- mcols
    object
}



## This step drops extra columns in `mcols()` and applies run-length encoding,
## to reduce memory overhead.
##
## Note that `removeNA()` call currently will error on complex columns.
## For example, this will error on `CharacterList` columns returned from
## GENCODE GFF3 file.
.minimizeGRanges <- function(object) {
    assert(is(object, "GRanges"))
    mcols <- mcols(object)
    ## Drop any complex S4 columns that aren't either atomic or list.
    keep <- bapply(
        X = mcols,
        FUN = function(x) {
            is.atomic(x) || is.list(x)
        }
    )
    mcols <- mcols[, keep, drop = FALSE]
    ## Ensure NA values are properly set, prior to `removeNA()` call.
    mcols <- sanitizeNA(mcols)
    ## Remove columns that are all `NA`. This step will remove all
    ## transcript-level columns from gene-level ranges.
    mcols <- removeNA(mcols)
    ## Apply run-length encoding on all atomic columns.
    mcols <- lapply(
        X = mcols,
        FUN = function(x) {
            if (isS4(x) || is(x, "AsIs") || !is.atomic(x)) {
                ## `I()` inhibits reinterpretation and returns `AsIs` class.
                ## This keeps complex columns (e.g. Entrez list) intact.
                ## Recommended in the `DataFrame` documentation.
                I(x)
            } else {
                ## Ensure factor levels get correctly reset, to save memory.
                if (is.factor(x)) {
                    x <- droplevels(x)
                }
                ## Use S4 run length encoding (Rle) for atomic metadata columns.
                ## Many of these elements are repetitive, and this makes
                ## operations faster.
                Rle(x)
            }
        }
    )
    ## `lapply()` returns as list, so we need to coerce back to DataFrame.
    mcols <- as(mcols, "DataFrame")
    mcols(object) <- mcols
    object
}



## Standardize the GRanges into desired conventions used in basejump. Note that
## this step makes GRanges imported via `rtracklayer::import()` incompatible
## with `GenomicFeatures::makeTxDbFromGRanges()`.
.standardizeGRanges <- function(object) {
    assert(is(object, "GRanges"))
    ## Standardize the metadata columns.
    mcols <- mcols(object)
    ## Use `transcript` prefix instead of `tx` consistently.
    colnames(mcols) <- gsub(
        pattern = "^tx_",
        replacement = "transcript_",
        x = colnames(mcols)
    )
    ## Ensure "ID" is always capitalized (e.g. "entrezid").
    colnames(mcols) <- gsub(
        pattern = "(.+)id$",
        replacement = "\\1ID",
        x = colnames(mcols)
    )
    ## Always return using camel case, even though GFF/GTF files use snake.
    colnames(mcols) <- camelCase(colnames(mcols))
    ## Always use `geneName` instead of `symbol`.
    ## Note that ensembldb output duplicates these.
    if (all(c("geneName", "symbol") %in% colnames(mcols))) {
        mcols[["symbol"]] <- NULL
    } else if ("symbol" %in% colnames(mcols)) {
        message("Renaming 'symbol' to 'geneName' in 'mcols()'.")
        mcols[["geneName"]] <- mcols[["symbol"]]
        mcols[["symbol"]] <- NULL
    }
    ## Re-slot updated mcols back into object before calculating broad class
    ## biotype and/or assigning names.
    mcols(object) <- mcols
    ## Ensure broad class definitions are included, using run-length encoding.
    ## Don't pass `mcols` to `.broadClass()`, use the GRanges instead because
    ## we need the corresponding seqnames for calculations.
    mcols(object)[["broadClass"]] <- Rle(.broadClass(object))
    ## Finally, sort the metadata columns alphabetically.
    mcols(object) <-
        mcols(object)[, sort(colnames(mcols(object))), drop = FALSE]
    ## Ensure the ranges are sorted by identifier.
    idCol <- .detectGRangesIDs(object)
    message(sprintf("Arranging by '%s'.", idCol))
    names(object) <- mcols(object)[[idCol]]
    object <- object[sort(names(object))]
    object
}
acidgenomics/freerange documentation built on Jan. 8, 2020, 3:45 a.m.