R/internal-rtracklayer.R

Defines functions .rtracklayerWormbaseTranscriptsGtf .rtracklayerWormbaseGenesGtf .rtracklayerRefseqTranscriptsGtf .rtracklayerRefseqTranscriptsGff .rtracklayerRefseqGenesGtf .rtracklayerRefseqGenesGff .getNcbiGeneIdsFromRefSeqGtf .getNcbiGeneIdsFromRefSeqGff .rtracklayerGencodeTranscriptsGtf .rtracklayerGencodeTranscriptsGff .rtracklayerGencodeGenesGtf .rtracklayerGencodeGenesGff .rtracklayerFlybaseTranscriptsGtf .rtracklayerFlybaseGenesGtf .rtracklayerEnsemblTranscriptsGtf .rtracklayerEnsemblTranscriptsGff .rtracklayerEnsemblGenesGtf .rtracklayerEnsemblGenesGff .makeGRangesFromRtracklayer

#' Make GRanges from rtracklayer
#'
#' @note Updated 2023-10-12.
#' @noRd
#'
#' @details
#' The `"meta"` argument is generated by `makeGRangesFromGff`.
.makeGRangesFromRtracklayer <-
    function(file,
             level,
             ignoreVersion,
             extraMcols,
             meta) {
        assert(
            isString(file),
            isFlag(ignoreVersion),
            isFlag(extraMcols),
            is.list(meta),
            isString(meta[["format"]]),
            isString(meta[["provider"]])
        )
        level <- match.arg(
            arg = level,
            choices = c("genes", "transcripts")
        )
        meta[["level"]] <- level
        gr <- import(con = .cacheIt(file))
        assert(is(gr, "GRanges"))
        format <- ifelse(
            test = grepl(pattern = "GTF", x = meta[["format"]]),
            yes = "GTF",
            no = "GFF"
        )
        provider <- match.arg(
            arg = meta[["provider"]],
            choices = c(
                "Ensembl",
                "FlyBase",
                "GENCODE",
                "RefSeq",
                "UCSC",
                "WormBase"
            )
        )
        funName <- paste0(
            ".",
            camelCase(
                object = tolower(paste("rtracklayer", provider, level, format)),
                strict = TRUE
            )
        )
        tryCatch(
            expr = {
                what <- .getFun(funName)
            },
            error = function(e) {
                abort(sprintf("Unsupported GFF: {.file %s}.", basename(file)))
            }
        )
        gr <- do.call(what = what, args = list("object" = gr))
        metadata(gr) <- meta
        seqinfo <- .getSeqinfo(meta)
        if (is(seqinfo, "Seqinfo")) {
            seqinfo(gr) <- seqinfo[seqlevels(gr)]
        }
        .makeGRanges(
            object = gr,
            ignoreVersion = ignoreVersion,
            extraMcols = extraMcols
        )
    }



## Ensembl =====================================================================

## Updated 2022-05-04.
.rtracklayerEnsemblGenesGff <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c("Name", "biotype", "gene_id", "version"),
                y = names(mcols(object))
            ),
            areDisjointSets(
                x = c("gene_biotype", "gene_id_version", "gene_name"),
                y = names(mcols(object))
            )
        )
        keep <- !is.na(mcols(object)[["gene_id"]])
        assert(
            any(keep),
            msg = "Failed to extract any genes."
        )
        object <- object[keep]
        assert(hasNoDuplicates(mcols(object)[["gene_id"]]))
        names(mcols(object))[
            names(mcols(object)) == "Name"
        ] <- "gene_name"
        names(mcols(object))[
            names(mcols(object)) == "biotype"
        ] <- "gene_biotype"
        mcols(object)[["gene_id_version"]] <-
            paste(
                mcols(object)[["gene_id"]],
                mcols(object)[["version"]],
                sep = "."
            )
        mcols(object)[["Alias"]] <- NULL
        mcols(object)[["ID"]] <- NULL
        mcols(object)[["Parent"]] <- NULL
        mcols(object)[["version"]] <- NULL
        object
    }



## Updated 2022-05-04.
.rtracklayerEnsemblGenesGtf <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c(
                    "gene_id",
                    "gene_version",
                    "type"
                ),
                y = names(mcols(object))
            ),
            areDisjointSets(
                x = "gene_id_version",
                y = names(mcols(object))
            )
        )
        keep <- mcols(object)[["type"]] == "gene"
        assert(
            any(keep),
            msg = "Failed to extract any genes."
        )
        object <- object[keep]
        assert(hasNoDuplicates(mcols(object)[["gene_id"]]))
        mcols(object)[["gene_id_version"]] <-
            paste(
                mcols(object)[["gene_id"]],
                mcols(object)[["gene_version"]],
                sep = "."
            )
        mcols(object)[["gene_version"]] <- NULL
        object
    }



## Updated 2022-05-04.
.rtracklayerEnsemblTranscriptsGff <-
    function(object) {
        genes <- .rtracklayerEnsemblGenesGff(object)
        mcols(genes) <- removeNa(mcols(genes))
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c("Name", "Parent", "biotype"),
                y = names(mcols(object))
            ),
            areDisjointSets(
                x = c("transcript_biotype", "transcript_name"),
                y = names(mcols(object))
            )
        )
        keep <- !is.na(mcols(object)[["transcript_id"]])
        assert(
            any(keep),
            msg = "Failed to extract any transcripts."
        )
        object <- object[keep]
        assert(
            hasNoDuplicates(mcols(object)[["transcript_id"]]),
            allAreMatchingRegex(
                x = as.character(mcols(object)[["Parent"]]),
                pattern = "^gene:"
            )
        )
        mcols(object) <- removeNa(mcols(object))
        names(mcols(object))[
            names(mcols(object)) == "biotype"
        ] <- "transcript_biotype"
        names(mcols(object))[
            names(mcols(object)) == "Name"
        ] <- "transcript_name"
        mcols(object)[["gene_id"]] <- gsub(
            pattern = "^gene:",
            replacement = "",
            x = as.character(mcols(object)[["Parent"]])
        )
        mcols(object)[["transcript_id_version"]] <-
            paste(
                mcols(object)[["transcript_id"]],
                mcols(object)[["version"]],
                sep = "."
            )
        mcols(object)[["Alias"]] <- NULL
        mcols(object)[["ID"]] <- NULL
        mcols(object)[["Parent"]] <- NULL
        mcols(object)[["version"]] <- NULL
        geneCols <- c(
            "gene_id",
            setdiff(
                x = names(mcols(genes)),
                y = names(mcols(object))
            )
        )
        mcols(object) <- leftJoin(
            x = mcols(object),
            y = mcols(genes)[geneCols],
            by = "gene_id"
        )
        object
    }



## Updated 2022-05-04.
.rtracklayerEnsemblTranscriptsGtf <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c(
                    "gene_id",
                    "gene_version",
                    "transcript_id",
                    "transcript_version",
                    "type"
                ),
                y = names(mcols(object))
            ),
            areDisjointSets(
                x = c(
                    "gene_id_version",
                    "transcript_id_version"
                ),
                y = names(mcols(object))
            )
        )
        keep <- mcols(object)[["type"]] == "transcript"
        assert(
            any(keep),
            msg = "Failed to extract any transcripts."
        )
        object <- object[keep]
        mcols(object)[["gene_id_version"]] <-
            paste(
                mcols(object)[["gene_id"]],
                mcols(object)[["gene_version"]],
                sep = "."
            )
        mcols(object)[["transcript_id_version"]] <-
            paste(
                mcols(object)[["transcript_id"]],
                mcols(object)[["transcript_version"]],
                sep = "."
            )
        mcols(object)[["gene_version"]] <- NULL
        mcols(object)[["transcript_version"]] <- NULL
        object
    }



## FlyBase =====================================================================

## Updated 2021-01-27.
.rtracklayerFlybaseGenesGtf <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c("gene_id", "type"),
                y = names(mcols(object))
            )
        )
        keep <- mcols(object)[["type"]] == "gene"
        assert(
            any(keep),
            msg = "Failed to extract any genes."
        )
        object <- object[keep]
        assert(hasNoDuplicates(mcols(object)[["gene_id"]]))
        object
    }



## Updated 2021-01-27.
.rtracklayerFlybaseTranscriptsGtf <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c("transcript_id", "type"),
                y = names(mcols(object))
            )
        )
        keep <- grepl(
            pattern = paste(c("^pseudogene$", "RNA$"), collapse = "|"),
            x = mcols(object)[["type"]],
            ignore.case = TRUE
        )
        assert(
            any(keep),
            msg = "Failed to extract any transcripts."
        )
        object <- object[keep]
        assert(hasNoDuplicates(mcols(object)[["transcript_id"]]))
        object
    }



## GENCODE =====================================================================

## Uses `gene_type` instead of `gene_biotype`.
## Note that `gene_id` and `gene_name` are nicely defined, so don't use `Name`.
## Consider removing gene and transcript versions automatically.

## Updated 2023-11-29.
.rtracklayerGencodeGenesGff <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c("ID", "gene_id", "type"),
                y = names(mcols(object))
            ),
            areDisjointSets(
                x = c(
                    "gene_id_no_version",
                    "gene_version"
                ),
                y = names(mcols(object))
            )
        )
        keep <- mcols(object)[["type"]] == "gene"
        assert(
            any(keep),
            msg = "Failed to extract any genes."
        )
        object <- object[keep]
        mcols(object)[["gene_id_version"]] <- mcols(object)[["ID"]]
        assert(hasNoDuplicates(mcols(object)[["gene_id_version"]]))
        mcols(object)[["gene_id"]] <-
            stripGeneVersions(mcols(object)[["gene_id_version"]])
        mcols(object)[["ID"]] <- NULL
        mcols(object)[["Parent"]] <- NULL
        mcols(object)[["ont"]] <- NULL
        object
    }



## Updated 2022-05-04.
.rtracklayerGencodeGenesGtf <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c("gene_id", "type"),
                y = names(mcols(object))
            ),
            areDisjointSets(
                x = c("gene_id_version", "gene_version"),
                y = names(mcols(object))
            )
        )
        keep <- mcols(object)[["type"]] == "gene"
        assert(
            any(keep),
            msg = "Failed to extract any genes."
        )
        object <- object[keep]
        mcols(object)[["gene_id_version"]] <- mcols(object)[["gene_id"]]
        assert(hasNoDuplicates(mcols(object)[["gene_id_version"]]))
        mcols(object)[["gene_id"]] <-
            stripGeneVersions(mcols(object)[["gene_id_version"]])
        mcols(object)[["ont"]] <- NULL
        object
    }



## Updated 2023-11-29.
.rtracklayerGencodeTranscriptsGff <-
    function(object) {
        assert(
            isSubset(
                x = c(
                    "ID",
                    "gene_id",
                    "transcript_id",
                    "type"
                ),
                y = names(mcols(object))
            ),
            areDisjointSets(
                x = c(
                    "transcript_id_no_version",
                    "transcript_version"
                ),
                y = names(mcols(object))
            )
        )
        keep <- mcols(object)[["type"]] == "transcript"
        assert(
            any(keep),
            msg = "Failed to extract any transcripts."
        )
        object <- object[keep]
        mcols(object)[["transcript_id_version"]] <- mcols(object)[["ID"]]
        assert(hasNoDuplicates(mcols(object)[["transcript_id_version"]]))
        mcols(object)[["transcript_id"]] <-
            stripTranscriptVersions(mcols(object)[["transcript_id_version"]])
        mcols(object)[["gene_id_version"]] <-
            as.character(mcols(object)[["Parent"]])
        mcols(object)[["gene_id"]] <-
            stripGeneVersions(mcols(object)[["gene_id_version"]])
        mcols(object)[["ID"]] <- NULL
        mcols(object)[["Parent"]] <- NULL
        mcols(object)[["ont"]] <- NULL
        object
    }


## GENCODE GRCh37 contains duplicate unversioned transcript identifiers.
##
## Updated 2023-11-29.
.rtracklayerGencodeTranscriptsGtf <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c(
                    "gene_id",
                    "transcript_id",
                    "type"
                ),
                y = names(mcols(object))
            ),
            areDisjointSets(
                x = c(
                    "gene_id_version",
                    "gene_version",
                    "transcript_id_version",
                    "transcript_version"
                ),
                y = names(mcols(object))
            )
        )
        keep <- mcols(object)[["type"]] == "transcript"
        assert(
            any(keep),
            msg = "Failed to extract any transcripts."
        )
        object <- object[keep]
        mcols(object)[["transcript_id_version"]] <-
            mcols(object)[["transcript_id"]]
        assert(hasNoDuplicates(mcols(object)[["transcript_id_version"]]))
        mcols(object)[["transcript_id"]] <-
            stripTranscriptVersions(mcols(object)[["transcript_id_version"]])
        mcols(object)[["gene_id_version"]] <- mcols(object)[["gene_id"]]
        mcols(object)[["gene_id"]] <-
            stripGeneVersions(mcols(object)[["gene_id_version"]])
        mcols(object)[["ont"]] <- NULL
        object
    }



## RefSeq ======================================================================

#' Extract NCBI gene identifiers from RefSeq GFF/GTF file.
#'
#' @section GCF_000001405.40_GRCh38.p14 genes with multiple NCBI identifiers:
#'
#' - TRNAA-AGC: 124901561, 124901562, 124901563, 124901564, 124901565
#' - TRNAE-UUC: 107987368, 124905580, 124905583, 124905584, 124905586
#' - TRNAG-CCC: 124905578, 124905581, 124905588
#' - TRNAN-GUU: 124905579, 124905582, 124905585, 124905587
#' - TRNAV-CAC: 107985614, 107985615
#'
#' Note that these are RefSeq status: MODEL.
#'
#' @note Updated 2023-03-01.
#' @noRd
.getNcbiGeneIdsFromRefSeqGff <- function(object) {
    mcols <- mcols(object)[, c("ID", "Dbxref")]
    geneIds <- mcols[["ID"]]
    dbxref <- mcols[["Dbxref"]]
    lgl <- grepl(pattern = "GeneID:", x = dbxref, fixed = TRUE)
    assert(
        is(dbxref, "CharacterList"),
        is(lgl, "LogicalList")
    )
    ncbiGeneIds <- unlist(dbxref[lgl], recursive = FALSE)
    assert(identical(length(ncbiGeneIds), length(geneIds)))
    ncbiGeneIds <- do.call(
        what = rbind,
        args = strsplit(x = ncbiGeneIds, split = ":", fixed = TRUE)
    )[, 2L]
    ncbiGeneIds <- as.integer(ncbiGeneIds)
    df <- DataFrame("ID" = geneIds, "ncbi_gene_id" = ncbiGeneIds)
    df <- unique(df[complete.cases(df), ])
    assert(hasNoDuplicates(df[["ID"]]))
    df
}



#' Extract NCBI gene identifiers from RefSeq GTF file.
#'
#' @note Updated 2023-03-01.
#' @noRd
.getNcbiGeneIdsFromRefSeqGtf <- function(object) {
    mcols <- mcols(object)[, c("gene_id", "db_xref")]
    keep <- grepl(pattern = "GeneID:", x = mcols[["db_xref"]], fixed = TRUE)
    mcols <- unique(mcols[keep, ])
    assert(hasNoDuplicates(mcols[["gene_id"]]))
    geneIds <- mcols[["gene_id"]]
    ncbiGeneIds <- strMatch(
        x = mcols[["db_xref"]],
        pattern = "GeneID:([[:digit:]]+)"
    )[, 2L]
    ncbiGeneIds <- as.integer(ncbiGeneIds)
    df <- DataFrame("gene_id" = geneIds, "ncbi_gene_id" = ncbiGeneIds)
    df <- df[complete.cases(df), ]
    df
}



## Updated 2022-05-04.
.rtracklayerRefseqGenesGff <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c("ID", "Parent", "description", "gene"),
                y = names(mcols(object))
            ),
            areDisjointSets(
                x = "gene_id",
                y = names(mcols(object))
            )
        )
        keep <- !is.na(mcols(object)[["gbkey"]]) &
            mcols(object)[["gbkey"]] == "Gene"
        assert(
            any(keep),
            msg = "Failed to extract any genes."
        )
        object <- object[keep]
        ncbiGeneIds <- .getNcbiGeneIdsFromRefSeqGff(object)
        mcols(object) <- leftJoin(x = mcols(object), y = ncbiGeneIds, by = "ID")
        names(mcols(object))[names(mcols(object)) == "ID"] <- "parent_gene_id"
        assert(hasNoDuplicates(mcols(object)[["parent_gene_id"]]))
        names(mcols(object))[names(mcols(object)) == "gene"] <- "gene_id"
        assert(all(grepl(
            pattern = "^gene-",
            x = mcols(object)[["parent_gene_id"]]
        )))
        mcols(object)[["parent_gene_id"]] <- gsub(
            pattern = "^gene-",
            replacement = "",
            x = mcols(object)[["parent_gene_id"]]
        )
        mcols(object)[["Name"]] <- NULL
        mcols(object)[["Note"]] <- NULL
        mcols(object)[["Parent"]] <- NULL
        mcols(object)[["end_range"]] <- NULL
        mcols(object)[["experiment"]] <- NULL
        mcols(object)[["gbkey"]] <- NULL
        mcols(object)[["start_range"]] <- NULL
        mcols(object)[["transl_except"]] <- NULL
        object
    }



## Updated 2022-05-04.
.rtracklayerRefseqGenesGtf <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c("gene", "gene_id", "type"),
                y = names(mcols(object))
            )
        )
        ## Need to run this before gene filtering, because only exons in GTF
        ## file contain the NCBI gene identifiers.
        ncbiGeneIds <- .getNcbiGeneIdsFromRefSeqGtf(object)
        keep <- mcols(object)[["type"]] == "gene"
        assert(
            any(keep),
            msg = "Failed to extract any genes."
        )
        object <- object[keep]
        mcols(object) <- leftJoin(
            x = mcols(object),
            y = ncbiGeneIds,
            by = "gene_id"
        )
        names(mcols(object))[
            names(mcols(object)) == "gene_id"
        ] <- "parent_gene_id"
        names(mcols(object))[
            names(mcols(object)) == "gene"
        ] <- "gene_id"
        assert(hasNoDuplicates(mcols(object)[["parent_gene_id"]]))
        mcols(object)[["gbkey"]] <- NULL
        mcols(object)[["note"]] <- NULL
        object
    }



## NOTE Consider including "ensemblTxId" from "dbxref" here.

## Updated 2023-03-01.
.rtracklayerRefseqTranscriptsGff <-
    function(object) {
        genes <- .rtracklayerRefseqGenesGff(object)
        genesMcols <- mcols(genes)[
            ,
            c(
                "description",
                "gene_biotype",
                "ncbi_gene_id",
                "parent_gene_id"
            ),
            drop = FALSE
        ]
        assert(
            hasNoDuplicates(genesMcols[["parent_gene_id"]]),
            is(object, "GRanges"),
            isSubset(
                x = c("Parent", "gene", "transcript_id"),
                y = names(mcols(object))
            ),
            areDisjointSets(
                x = "gene_id",
                y = names(mcols(object))
            )
        )
        keep <- !is.na(mcols(object)[["transcript_id"]])
        assert(
            any(keep),
            msg = "Failed to extract any transcripts."
        )
        object <- object[keep]
        ## e.g. "NM_000014.6".
        keep <- grepl(
            pattern = "^[A-Z]{2}_[0-9]+\\.[0-9]+$",
            x = mcols(object)[["transcript_id"]]
        )
        object <- object[keep]
        ## Only keep transcript annotations that map to a parent gene.
        ## This is somewhat slow and may be optimizable.
        keep <- bapply(
            X = mcols(object)[["Parent"]],
            FUN = function(x) {
                any(grepl(pattern = "^gene-", x = x))
            }
        )
        assert(
            any(keep),
            msg = "Failed to match transcripts against parent genes."
        )
        object <- object[keep]
        ## Ensure that matching transcripts contain a unique gene parent.
        assert(
            all(bapply(
                X = mcols(object)[["Parent"]],
                FUN = isScalar
            )),
            msg = "Elements do not contain a unique gene parent."
        )
        mcols(object)[["parent_gene_id"]] <- vapply(
            X = mcols(object)[["Parent"]],
            FUN = function(x) {
                sub(pattern = "^gene-", replacement = "", x = x[[1L]])
            },
            FUN.VALUE = character(1L),
            USE.NAMES = FALSE
        )
        names(mcols(object))[names(mcols(object)) == "gene"] <- "gene_id"
        cols <- c(
            setdiff(
                x = colnames(mcols(object)),
                y = colnames(genesMcols)
            ),
            "parent_gene_id"
        )
        mcols(object) <- mcols(object)[, cols]
        mcols(object) <- leftJoin(
            x = mcols(object),
            y = genesMcols,
            by = "parent_gene_id"
        )
        mcols(object)[["ID"]] <- NULL
        mcols(object)[["Name"]] <- NULL
        mcols(object)[["Note"]] <- NULL
        mcols(object)[["Parent"]] <- NULL
        mcols(object)[["end_range"]] <- NULL
        mcols(object)[["experiment"]] <- NULL
        mcols(object)[["gbkey"]] <- NULL
        mcols(object)[["start_range"]] <- NULL
        mcols(object)[["transl_except"]] <- NULL
        object
    }



## NOTE Consider including "ensemblTxId" from "dbxref" here.

## Updated 2023-03-01.
.rtracklayerRefseqTranscriptsGtf <-
    function(object) {
        genes <- .rtracklayerRefseqGenesGtf(object)
        genesMcols <- mcols(genes)[
            ,
            c(
                "description",
                "gene_biotype",
                "ncbi_gene_id",
                "parent_gene_id"
            ),
            drop = FALSE
        ]
        assert(
            hasNoDuplicates(genesMcols[["parent_gene_id"]]),
            is(object, "GRanges"),
            isSubset(
                x = c(
                    "transcript_id",
                    "type"
                ),
                y = names(mcols(object))
            )
        )
        keep <- mcols(object)[["type"]] == "transcript"
        assert(
            any(keep),
            msg = "Failed to extract any transcripts."
        )
        object <- object[keep]
        ## e.g. "NM_000014.6".
        keep <- grepl(
            pattern = "^[A-Z]{2}_[0-9]+\\.[0-9]+$",
            x = mcols(object)[["transcript_id"]]
        )
        object <- object[keep]
        assert(hasNoDuplicates(mcols(object)[["transcript_id"]]))
        names(mcols(object))[
            names(mcols(object)) == "gene_id"
        ] <- "parent_gene_id"
        names(mcols(object))[
            names(mcols(object)) == "gene"
        ] <- "gene_id"
        cols <- c(
            setdiff(
                x = colnames(mcols(object)),
                y = colnames(genesMcols)
            ),
            "parent_gene_id"
        )
        mcols(object) <- mcols(object)[, cols]
        mcols(object) <- leftJoin(
            x = mcols(object),
            y = genesMcols,
            by = "parent_gene_id"
        )
        mcols(object)[["experiment"]] <- NULL
        mcols(object)[["gbkey"]] <- NULL
        mcols(object)[["note"]] <- NULL
        object
    }



## WormBase ====================================================================

## Updated 2021-08-05.
.rtracklayerWormbaseGenesGtf <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c("gene_id", "type"),
                y = names(mcols(object))
            )
        )
        keep <- mcols(object)[["type"]] == "gene"
        assert(
            any(keep),
            msg = "Failed to extract any transcripts."
        )
        object <- object[keep]
        assert(
            hasNoDuplicates(mcols(object)[["gene_id"]]),
            allAreMatchingRegex(
                x = mcols(object)[["gene_id"]],
                pattern = "^WBGene[[:digit:]]{8}$"
            )
        )
        object
    }



## Updated 2021-08-05.
.rtracklayerWormbaseTranscriptsGtf <-
    function(object) {
        assert(
            is(object, "GRanges"),
            isSubset(
                x = c("gene_id", "transcript_id", "type"),
                y = names(mcols(object))
            )
        )
        ## Keep track of gene-level metadata, that we'll join below.
        genes <- .rtracklayerWormbaseGenesGtf(object)
        geneCols <- grep(
            pattern = "^gene_",
            x = colnames(mcols(genes)),
            value = TRUE
        )
        geneMcols <- mcols(genes, use.names = FALSE)[, geneCols]
        keep <- mcols(object)[["type"]] == "transcript"
        assert(
            any(keep),
            msg = "Failed to extract any transcripts."
        )
        object <- object[keep]
        assert(
            hasNoDuplicates(mcols(object)[["transcript_id"]]),
            allAreMatchingRegex(
                x = mcols(object)[["gene_id"]],
                pattern = "^WBGene[[:digit:]]{8}$"
            )
        )
        mcols <- mcols(object)
        cols <- c(
            setdiff(
                x = colnames(mcols),
                y = colnames(geneMcols)
            ),
            "gene_id"
        )
        mcols <- mcols[, cols]
        mcols <- leftJoin(x = mcols, y = geneMcols, by = "gene_id")
        mcols(object) <- mcols
        object
    }
acidgenomics/AcidGenomes documentation built on Dec. 10, 2023, 10:35 p.m.