GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

Documented in get_and_fix_chrom_info_from_UCSC getChromInfoFromUCSC registered_UCSC_genomes saveAssembledMoleculesInfoFromUCSC

### =========================================================================
### getChromInfoFromUCSC()
### -------------------------------------------------------------------------
###


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .add_NCBI_cols_to_UCSC_chrom_info()
###

.match_UCSC_seqlevels_to_NCBI_accns <- function(UCSC_seqlevels, NCBI_accns,
                                                trim.seqlevels=FALSE,
                                                accn.suffix="")
{
    if (trim.seqlevels) {
        UCSC_seqlevels <- sub("_(alt|random|fix)$", "", UCSC_seqlevels)
        UCSC_seqlevels <- sub("^[^_]*_", "", UCSC_seqlevels)
    }
    x <- chartr("v-", "..", UCSC_seqlevels)
    x <- paste0(x, accn.suffix)
    names(x) <- UCSC_seqlevels
    solid_match2(tolower(x), tolower(NCBI_accns),
                 x_what="UCSC seqlevel", table_what="accession number")
}

.match_UCSC_seqlevels_part2_to_NCBI_accns <-
    function(UCSC_seqlevels, NCBI_accns, part2.auto.version="", accn.prefix="")
{
    ans <- rep.int(NA_integer_, length(UCSC_seqlevels))
    seqlevel_parts <- strsplit(UCSC_seqlevels, "_")
    nparts <- lengths(seqlevel_parts)
    idx2 <- which(nparts >= 2L)
    if (length(idx2) == 0L)
        return(ans)
    offsets <- c(0L, cumsum(nparts[idx2[-length(idx2)]]))
    x <- unlist(seqlevel_parts[idx2], use.names=FALSE)[offsets + 2L]
    x <- chartr("v", ".", x)
    unversioned_idx <- grep(".", x, fixed=TRUE, invert=TRUE)
    if (length(unversioned_idx) != 0L)
        x[unversioned_idx] <- paste0(x[unversioned_idx], part2.auto.version)
    x <- paste0(accn.prefix, x)
    names(x) <- UCSC_seqlevels[idx2]
    ans[idx2] <- solid_match2(tolower(x), tolower(NCBI_accns),
                              x_what="UCSC seqlevel",
                              table_what="accession number")
    ans
}

### The workhorse behind .add_NCBI_cols_to_UCSC_chrom_info().
### - All input vectors must be character vectors.
### - All NCBI input vectors must have the same length.
### - Vectors 'UCSC_seqlevels' and 'NCBI_seqlevels' must be "primary
###   keys" i.e. must not contain NAs, empty strings, or duplicates.
###   (No such assumption is made about the other input vectors.)
### Returns an integer vector parallel to 'UCSC_seqlevels'.
.map_UCSC_seqlevels_to_NCBI_seqlevels <- function(UCSC_seqlevels,
                                                  NCBI_seqlevels,
                                                  NCBI_UCSCStyleName,
                                                  NCBI_GenBankAccn,
                                                  NCBI_RefSeqAccn,
                                                  special_mappings=NULL)
{
    stopifnot(is.character(UCSC_seqlevels),
              is_primary_key(UCSC_seqlevels),
              is.character(NCBI_seqlevels),
              is_primary_key(NCBI_seqlevels),
              is.character(NCBI_UCSCStyleName),
              is.character(NCBI_GenBankAccn),
              is.character(NCBI_RefSeqAccn))

    L2R <- rep.int(NA_integer_, length(UCSC_seqlevels))
    unmapped_idx <- seq_along(L2R)

    ## 1. Handle special mappings.
    if (!is.null(special_mappings)) {
        m1 <- match(names(special_mappings), UCSC_seqlevels)
        if (anyNA(m1))
            stop(wmsg("'names(special_mappings)' contains sequence ",
                      "names not found in the UCSC genome"))
        m2 <- match(special_mappings, NCBI_seqlevels)
        if (anyNA(m2))
            stop(wmsg("'special_mappings' contains sequence ",
                      "names not found in the NCBI assembly"))
        L2R[m1] <- m2

        unmapped_idx <- which(is.na(L2R))
        if (length(unmapped_idx) == 0L)
            return(L2R)

        NCBI_UCSCStyleName[m2] <- NA_character_
    }

    ## 2. We assign based on exact match between 'UCSC_seqlevels'
    ##    and 'NCBI_UCSCStyleName'.
    m <- match(UCSC_seqlevels[unmapped_idx], NCBI_UCSCStyleName)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 3. We assign based on exact match (case insensitive) between UCSC
    ##    and NCBI seqlevels.
    ucsc_seqlevels <- tolower(UCSC_seqlevels)
    ncbi_seqlevels <- tolower(NCBI_seqlevels)
    m <- match(ucsc_seqlevels[unmapped_idx], ncbi_seqlevels)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 4. We assign based on exact match (case insensitive) between UCSC
    ##    and NCBI seqlevels after removal of the "chr" prefix on both sides.
    nochr_ucsc_seqlevels <- sub("^chr", "", ucsc_seqlevels[unmapped_idx])
    nochr_ncbi_seqlevels <- sub("^chr", "", ncbi_seqlevels)
    m <- match(nochr_ucsc_seqlevels, nochr_ncbi_seqlevels)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 5. We assign based on match between UCSC seqlevels and RefSeq
    ##    accession numbers.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_RefSeqAccn)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 6. We assign based on match between UCSC seqlevels and GenBank
    ##    accession numbers.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_GenBankAccn)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 7. We assign based on match between UCSC seqlevels and GenBank
    ##    accession numbers after adding ".1" suffix to UCSC seqlevels.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_GenBankAccn,
                                             accn.suffix=".1")
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 8. We assign based on match between UCSC seqlevels and RefSeq
    ##    accession numbers after trimming the first part (e.g. "chr1_")
    ##    and "_random" suffix from the seqlevels.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_RefSeqAccn,
                                             trim.seqlevels=TRUE)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 9. We assign based on GenBank accession number inferred from part 2 of
    ##    UCSC seqlevels (after adding automatic suffix ".1" to the inferred
    ##    GenBank accession number if it's not versioned).
    m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
                                        UCSC_seqlevels[unmapped_idx],
                                        NCBI_GenBankAccn,
                                        part2.auto.version=".1")
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 10. We assign based on GenBank accession number inferred from part 2 of
    ##    UCSC seqlevels (after dropping the version from the GenBank accession
    ##    numbers provided by NCBI).
    NCBI_GenBankAccn0 <- sub("\\.[^.]*$", "", NCBI_GenBankAccn)
    m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
                                        UCSC_seqlevels[unmapped_idx],
                                        NCBI_GenBankAccn0)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 11. We assign based on GenBank accession number inferred from part 2 of
    ##    UCSC seqlevels after adding AAD prefix to it.
    m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
                                        UCSC_seqlevels[unmapped_idx],
                                        NCBI_GenBankAccn,
                                        part2.auto.version=".1",
                                        accn.prefix="AAD")
    L2R[unmapped_idx] <- m

    L2R
}

.add_NCBI_cols_to_UCSC_chrom_info <- function(
    UCSC_chrom_info, assembly_accession,
    AssemblyUnits=NULL, special_mappings=NULL, unmapped_seqs=NULL,
    drop_unmapped=FALSE)
{
    UCSC_seqlevels <- UCSC_chrom_info[ , "chrom"]

    if (length(unmapped_seqs) != 0L) {
        unmapped_seqs_role <- rep.int(names(unmapped_seqs),
                                      lengths(unmapped_seqs))
        unmapped_seqs <- unlist(unmapped_seqs, use.names=FALSE)
        unmapped_idx <- match(unmapped_seqs, UCSC_seqlevels)
        stopifnot(!anyNA(unmapped_idx))
    }

    NCBI_chrom_info <- getChromInfoFromNCBI(assembly_accession,
                                            assembly.units=AssemblyUnits)

    NCBI_seqlevels     <- NCBI_chrom_info[ , "SequenceName"]
    NCBI_UCSCStyleName <- NCBI_chrom_info[ , "UCSCStyleName"]
    NCBI_GenBankAccn   <- NCBI_chrom_info[ , "GenBankAccn"]
    NCBI_RefSeqAccn    <- NCBI_chrom_info[ , "RefSeqAccn"]
    L2R <- .map_UCSC_seqlevels_to_NCBI_seqlevels(
                                       UCSC_seqlevels,
                                       NCBI_seqlevels,
                                       NCBI_UCSCStyleName,
                                       NCBI_GenBankAccn,
                                       NCBI_RefSeqAccn,
                                       special_mappings=special_mappings)
    if (length(unmapped_seqs) != 0L)
        L2R[unmapped_idx] <- NA_integer_
    L2R_is_NA <- is.na(L2R)
    mapped_idx <- which(!L2R_is_NA)

    if (isTRUE(drop_unmapped)) {
        UCSC_chrom_info <-
            S4Vectors:::extract_data_frame_rows(UCSC_chrom_info, mapped_idx)
        L2R <- L2R[mapped_idx]
        mapped_idx <- seq_along(L2R)
        ## Before we drop the unmapped UCSC seqlevels, we want to make sure
        ## that all the NCBI seqlevels are reverse mapped. For example, in
        ## the case of hg38, the chromInfo table at UCSC contains sequences
        ## that belong to GRCh38.p13 but not to GRCh38. So we want to make
        ## sure that after we drop these "foreign sequences", we are left
        ## with a one-to-one mapping between UCSC seqlevels and the 455 NCBI
        ## seqlevels that are in the assembly report for GRCh38.
        idx <- setdiff(seq_along(NCBI_seqlevels), L2R)
        if (length(idx) != 0L) {
            in1string <- paste0(NCBI_seqlevels[idx], collapse=", ")
            stop(wmsg("no UCSC seqlevel could be mapped to the following ",
                      "NCBI seqlevel(s): ", in1string))
        }
    } else {
        unexpectedly_unmapped_idx <-
            which(L2R_is_NA & !(UCSC_seqlevels %in% unmapped_seqs))
        if (length(unexpectedly_unmapped_idx) != 0L) {
            in1string <- paste0(UCSC_seqlevels[unexpectedly_unmapped_idx],
                                collapse=", ")
            stop(wmsg("cannot map the following UCSC seqlevel(s) to an ",
                      "NCBI seqlevel: ", in1string))
        }
    }

    NCBI_chrom_info <- S4Vectors:::extract_data_frame_rows(NCBI_chrom_info, L2R)

    ## NCBI columns "circular", "SequenceLength", and "UCSCStyleName"
    ## are expected to be redundant with UCSC columns "circular", "size",
    ## and "chrom", respectively. Let's make sure these 3 NCBI columns
    ## agree with their UCSC counterpart before we drop them.

    stopifnot(identical(UCSC_chrom_info[mapped_idx, "circular"],
                        NCBI_chrom_info[mapped_idx, "circular"]))

    compare_idx <- which(!is.na(NCBI_chrom_info[ , "SequenceLength"]))
    stopifnot(identical(UCSC_chrom_info[compare_idx, "size"],
                        NCBI_chrom_info[compare_idx, "SequenceLength"]))

    ## According to the UCSC-style-name column of the assembly report for
    ## GRCh37.p13 (accession GCF_000001405.25), MT in GRCh37.p13 is mapped
    ## to chrM in hg19, which is wrong! So we skip this sanity check for
    ## GCF_000001405.25.
    if (assembly_accession != "GCF_000001405.25") {
        compare_idx <- which(!is.na(NCBI_chrom_info[ , "UCSCStyleName"]))
        x <- UCSC_chrom_info[compare_idx, "chrom"]
        y <- NCBI_chrom_info[compare_idx, "UCSCStyleName"]
        ## See https://github.com/Bioconductor/GenomeInfoDb/issues/27#issuecomment-871639449
        if (assembly_accession == "GCF_000001635.26") {
            i1 <- match("chr9_KB469738_fix", x)
            i2 <- match("chr9_KB469738v3_fix", y)
            stopifnot(isFALSE(is.na(i1)),
                      isFALSE(is.na(i2)),
                      identical(i1, i2))
            x <- x[-i1]
            y <- y[-i2]
        }
        stopifnot(identical(x, y))
    }

    drop_columns <- c("SequenceLength", "UCSCStyleName", "circular")
    NCBI_chrom_info <- drop_cols(NCBI_chrom_info, drop_columns)
    colnames(NCBI_chrom_info) <- paste0("NCBI.", colnames(NCBI_chrom_info))
    ans <- cbind(UCSC_chrom_info, NCBI_chrom_info)

    if (length(unmapped_seqs) != 0L)
        ans[unmapped_idx, "NCBI.SequenceRole"] <- unmapped_seqs_role

    stopifnot(!is.unsorted(ans[ , "NCBI.SequenceRole"]))
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .add_ensembl_column()
###

.fetch_ucscToEnsembl_table <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    col2class <- c(ucsc="character", ensembl="character")
    fetch_table_dump_from_UCSC(genome, "ucscToEnsembl",
                     col2class=col2class,
                     goldenPath.url=goldenPath.url)
}

### Filters and reformats the chromAlias data to so it's returned in the
### same format as the ucscToEnsembl table (i.e. 2-column data frame with
### columns "ucsc" and "ensembl").
.fetch_ucsc2ensembl_from_chromAlias_table <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    col2class <- c(ensembl="character", ucsc="character", source="factor")
    chromAlias <- fetch_table_dump_from_UCSC(genome, "chromAlias",
                                   col2class=col2class,
                                   goldenPath.url=goldenPath.url)
    ## Filters and reformats.
    keep_idx <- grep("ensembl", chromAlias[ , "source"], fixed=TRUE)
    ucsc2ensembl <- chromAlias[keep_idx, c("ucsc", "ensembl")]

    ## In some **very rare** situations, it seems that the chromAlias table
    ## can contain ambiguous ucsc-to-ensembl mappings where a given UCSC
    ## chromosome name is mapped to more than one Ensembl name. For example
    ## this happens for rn6 where unplaced scaffold chrUn_AABR07022993v1 is
    ## mapped to Ensembl names AABR07022518.1, AABR07023006.1, AABR07022993.1.
    ## Note that as of Jan 2020, the chromAlias table for rn6 is the only
    ## known case of ambiguous ucsc-to-ensembl mapping.
    if (anyDuplicated(ucsc2ensembl[ , "ucsc"])) {
        ## The disambiguation algorithm below assumes that the ambiguous
        ## ucsc-to-ensembl mappings are of the form xxYYY-to-ZZZ where
        ## ZZZ is a GenBank accession, YYY a GenBank accession where the
        ## dot (".") has been replaced with a "v" and xx can be any suffix.
        ## To disambiguate, we only keep mappings where YYY and ZZZ are the
        ## same (modulo the dot/v substitution).
        idx <- which(duplicated(ucsc2ensembl[ , "ucsc"]) |
                     duplicated(ucsc2ensembl[ , "ucsc"], fromLast=TRUE))
        ensembl <- chartr(".", "v", ucsc2ensembl[idx , "ensembl"])
        ucsc <- ucsc2ensembl[idx , "ucsc"]
        ucsc_nc <- nchar(ucsc)
        ucsc_suffix <- substr(ucsc, ucsc_nc - nchar(ensembl) + 1L, ucsc_nc)
        ok <- ucsc_suffix == ensembl
        if (sum(ok) == 0L)
            stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
        if (!all(ucsc %in% ucsc[ok]))
            stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
        if (anyDuplicated(ucsc[ok]))
            stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
        drop_idx <- idx[!ok]
        ucsc2ensembl <- ucsc2ensembl[-drop_idx, , drop=FALSE]
    }
    ucsc2ensembl
}

.try_to_fetch_ucscToEnsembl <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    message("Trying 'ucscToEnsembl' ... ", appendLF=FALSE)
    ucscToEnsembl <- try(
        suppressWarnings(
            .fetch_ucscToEnsembl_table(genome, goldenPath.url=goldenPath.url)
        ), silent=TRUE)
    if (inherits(ucscToEnsembl, "try-error")) {
        message("FAILED! (table does not exist)")
        return(NULL)
    }
    message("OK")
    ucscToEnsembl
}

.try_to_fetch_ucsc2ensembl_from_chromAlias <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    message("Trying 'chromAlias' ... ", appendLF=FALSE)
    ucsc2ensembl <- try(
        suppressWarnings(
            .fetch_ucsc2ensembl_from_chromAlias_table(genome,
                                                goldenPath.url=goldenPath.url)
        ), silent=TRUE)
    if (inherits(ucsc2ensembl, "try-error")) {
        message("FAILED! (table does not exist)")
        return(NULL)
    }
    if (nrow(ucsc2ensembl) == 0L) {
        message("FAILED! (table exists but has no Ensembl information)")
        return(NULL)
    }
    message("OK")
    ucsc2ensembl
}

.fetch_Ensembl_column <- function(UCSC_chroms, genome, table=NA,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    if (identical(table, "ucscToEnsembl")) {
        ucsc2ensembl <- .fetch_ucscToEnsembl_table(genome,
                                                goldenPath.url=goldenPath.url)
    } else if (identical(table, "chromAlias")) {
        ucsc2ensembl <- .fetch_ucsc2ensembl_from_chromAlias_table(genome,
                                                goldenPath.url=goldenPath.url)
    } else if (identical(table, NA)) {
        table <- "ucscToEnsembl"
        ucsc2ensembl <- .try_to_fetch_ucscToEnsembl(genome,
                                      goldenPath.url=goldenPath.url)
        if (is.null(ucsc2ensembl)) {
            table <- "chromAlias"
            ucsc2ensembl <- .try_to_fetch_ucsc2ensembl_from_chromAlias(genome,
                                      goldenPath.url=goldenPath.url)
            if (is.null(ucsc2ensembl)) {
                warning(wmsg("No Ensembl sequence names could be ",
                             "found for UCSC genome ", genome))
                return(rep.int(NA_character_, length(UCSC_chroms)))
            }
        }
    } else {
        stop(wmsg("supplied 'table' is not supported"))
    }
    oo <- match(UCSC_chroms, ucsc2ensembl[ , "ucsc"])
    if (is.null(table) && anyNA(oo))
        warning(wmsg("incomplete data in '", table, "' table for ",
                     "UCSC genome ", genome, ": some UCSC sequence ",
                     "names are missing"))
    ucsc2ensembl[oo , "ensembl"]
}

.add_ensembl_column <- function(UCSC_chrom_info, genome, table=NA,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    ensembl <- .fetch_Ensembl_column(UCSC_chrom_info[ , "chrom"],
                                     genome, table=table,
                                     goldenPath.url=goldenPath.url)
    cbind(UCSC_chrom_info, ensembl=ensembl, stringsAsFactors=FALSE)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### registered_UCSC_genomes()
###

.parse_script_for_registered_UCSC_genome <- function(script_path)
{
    filename <- basename(script_path)
    if (substr(filename, nchar(filename)-1L, nchar(filename)) != ".R")
        stop(wmsg("name of genome registration file '", filename, "' must ",
                  "have extension .R"))
    if (grepl(" ", filename, fixed=TRUE))
        stop(wmsg("name of genome registration file '", filename, "' must ",
                  "not contain spaces"))

    ## Placeholders. Will actually get defined when we source the script.
    ## See README.TXT in inst/registered/UCSC_genomes/ for the list of
    ## variables.
    GENOME <- ORGANISM <- ASSEMBLED_MOLECULES <- CIRC_SEQS <- NULL
    FETCH_ORDERED_CHROM_SIZES <- NCBI_LINKER <- ENSEMBL_LINKER <- NULL
    source(script_path, local=TRUE)

    stop_if <- function(notok, ...) {
        if (notok)
            stop("Error in UCSC genome registration file '", filename,
                 "':\n  ", wmsg(...))
    }

    ## Check GENOME.
    stop_if(is.null(GENOME), "'GENOME' must be defined")
    stop_if(!isSingleString(GENOME), "'GENOME' must be a single string")
    target <- substr(filename, 1L, nchar(filename)-2L)
    stop_if(!identical(target, GENOME), "'GENOME' must match filename")

    ## Check ORGANISM.
    stop_if(is.null(ORGANISM), "'ORGANISM' must be defined")
    stop_if(!isSingleString(ORGANISM), "'ORGANISM' must be a single string")
    stop_if(grepl("_", ORGANISM, fixed=TRUE),
            "underscores are not allowed in 'ORGANISM' (use spaces instead)")

    ## Check ASSEMBLED_MOLECULES.
    stop_if(is.null(ASSEMBLED_MOLECULES),
            "'ASSEMBLED_MOLECULES' must be defined")
    stop_if(!is.character(ASSEMBLED_MOLECULES),
            "'ASSEMBLED_MOLECULES' must be a character vector")
    stop_if(!is_primary_key(ASSEMBLED_MOLECULES),
            "'ASSEMBLED_MOLECULES' must ",
            "not contain NAs, empty strings, or duplicates")

    ## Check CIRC_SEQS.
    stop_if(is.null(CIRC_SEQS),
            "'CIRC_SEQS' must be defined")
    stop_if(!is.character(CIRC_SEQS),
            "'CIRC_SEQS' must be a character vector")
    stop_if(anyDuplicated(CIRC_SEQS),
            "'CIRC_SEQS' must not contain duplicates")
    stop_if(!all(CIRC_SEQS %in% ASSEMBLED_MOLECULES),
            "'CIRC_SEQS' must be a subset of 'ASSEMBLED_MOLECULES'")

    ## Check FETCH_ORDERED_CHROM_SIZES.
    if (!is.null(FETCH_ORDERED_CHROM_SIZES)) {
        stop_if(!is.function(FETCH_ORDERED_CHROM_SIZES),
                "when defined, 'FETCH_ORDERED_CHROM_SIZES' must be a function")
    }

    ## Check NCBI_LINKER.
    if (!is.null(NCBI_LINKER)) {
        stop_if(!is.list(NCBI_LINKER),
                "when defined, 'NCBI_LINKER' must be a named list")
        linker_fields <- names(NCBI_LINKER)
        stop_if(!is.character(linker_fields),
                "when defined, 'NCBI_LINKER' must be a named list")
        stop_if(!is_primary_key(linker_fields),
                "the names on 'NCBI_LINKER' must ",
                "not contain NAs, empty strings, or duplicates")
        stop_if(!("assembly_accession" %in% linker_fields),
                "'NCBI_LINKER' must have field \"assembly_accession\"")
        accession <- NCBI_LINKER$assembly_accession
        stop_if(!isSingleString(accession) || accession == "",
                "\"assembly_accession\" field in 'NCBI_LINKER' must ",
                "be a single non-empty string")
        NCBI_assembly_info <- find_NCBI_assembly_info_for_accession(accession)
        stop_if(is.null(NCBI_assembly_info),
                "\"assembly_accession\" field in 'NCBI_LINKER' must ",
                "be associated with a registered NCBI assembly")
        stop_if(!identical(ORGANISM, NCBI_assembly_info$organism),
                "the NCBI assembly associated with the \"assembly_accession\" ",
                "field in 'NCBI_LINKER' is registered for an organism ",
                "(\"", NCBI_assembly_info$organism, "\") that differs ",
                "from 'ORGANISM' (\"", ORGANISM, "\")")
    }

    ## Check ENSEMBL_LINKER.
    if (!is.null(ENSEMBL_LINKER)) {
        stop_if(!isSingleString(ENSEMBL_LINKER) || ENSEMBL_LINKER == "",
                "when defined, 'ENSEMBL_LINKER' must ",
                "be a single non-empty string")
    }

    list(GENOME=GENOME,
         ORGANISM=ORGANISM,
         ASSEMBLED_MOLECULES=ASSEMBLED_MOLECULES,
         CIRC_SEQS=CIRC_SEQS,
         FETCH_ORDERED_CHROM_SIZES=FETCH_ORDERED_CHROM_SIZES,
         NCBI_LINKER=NCBI_LINKER,
         ENSEMBL_LINKER=ENSEMBL_LINKER)
}

registered_UCSC_genomes <- function(organism=NA)
{
    if (!isSingleStringOrNA(organism))
        stop(wmsg("'organism' must be a single string or NA"))
    dir_path <- system.file("registered", "UCSC_genomes",
                             package="GenomeInfoDb")
    file_paths <- list.files(dir_path, pattern="\\.R$", full.names=TRUE)
    assemblies <- lapply(file_paths, .parse_script_for_registered_UCSC_genome)

    colnames <- c("organism", "genome", "NCBI_assembly",
                  "assembly_accession", "with_Ensembl", "circ_seqs")
    make_col <- function(j) {
        colname <- colnames[[j]]
        if (colname == "NCBI_assembly") {
            col <- vapply(assemblies,
                function(assembly_info) {
                    linker <- assembly_info$NCBI_LINKER
                    if (is.null(linker))
                        return(NA_character_)
                    accession <- linker$assembly_accession
                    find_NCBI_assembly_info_for_accession(accession)$assembly
                }, character(1), USE.NAMES=FALSE)
            return(col)
        }
        if (colname == "assembly_accession") {
            col <- vapply(assemblies,
                function(assembly_info) {
                    linker <- assembly_info$NCBI_LINKER
                    if (is.null(linker))
                        return(NA_character_)
                    linker$assembly_accession
                }, character(1), USE.NAMES=FALSE)
            return(col)
        }
        if (colname == "with_Ensembl") {
            col <- vapply(assemblies,
                function(assembly_info) {
                    linker <- assembly_info$ENSEMBL_LINKER
                    !is.null(linker)
                }, logical(1), USE.NAMES=FALSE)
            return(col)
        }
        COLNAME <- toupper(colname)
        col0 <- lapply(assemblies, `[[`, COLNAME)
        if (colname == "circ_seqs")
            return(CharacterList(col0))
        stopifnot(all(lengths(col0) == 1L))
        col <- as.character(unlist(col0, use.names=FALSE))
        if (colname == "organism")
            col <- factor(col)  # order of levels will dictate order
                                # of rows in final DataFrame
        col
    }

    listData <- lapply(setNames(seq_along(colnames), colnames), make_col)
    DF <- S4Vectors:::new_DataFrame(listData, nrows=length(assemblies))
    oo <- UCSC.utils:::order_organism_genome_pairs(DF$organism, DF$genome)
    DF <- DF[oo, , drop=FALSE]
    if (!is.na(organism)) {
        keep_idx <- grep(organism, DF$organism, ignore.case=TRUE)
        DF <- DF[keep_idx, , drop=FALSE]
    }
    as.data.frame(DF)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### getChromInfoFromUCSC()
###

.UCSC_cached_chrom_info <- new.env(parent=emptyenv())

.get_chrom_info_for_unregistered_UCSC_genome <- function(genome,
    assembled.molecules.only=FALSE,
    add.ensembl.col=FALSE,
    goldenPath.url=getOption("UCSC.goldenPath.url"),
    recache=FALSE)
{
    ans <- .UCSC_cached_chrom_info[[genome]]
    if (is.null(ans) || recache) {
        ans <- try(suppressWarnings(
                       fetch_chrom_sizes_from_UCSC(genome,
                           goldenPath.url=goldenPath.url)),
                   silent=TRUE)
        if (inherits(ans, "try-error"))
            stop(wmsg("unknown UCSC genome: ", genome))
        oo <- orderSeqlevels(ans[ , "chrom"])
        ans <- S4Vectors:::extract_data_frame_rows(ans, oo)

        ## Add columns "assembled" and "circular".
        assembled <- rep.int(NA, nrow(ans))
        circular <- make_circ_flags_from_circ_seqs(ans[ , "chrom"])
        ans <- cbind(ans, assembled=assembled, circular=circular)

        .UCSC_cached_chrom_info[[genome]] <- ans
    }
    if (isTRUE(add.ensembl.col)) {
        warning(wmsg("'add.ensembl.col' got ignored for unregistered ",
                     "UCSC genome ", genome, "."),
                     "\n  ",
                wmsg("Use 'add.ensembl.col=\"force\"' to try adding ",
                     "the \"ensembl\" column anyway."))
    } else if (identical(add.ensembl.col, "force")) {
        ans <- .add_ensembl_column(ans, genome,
                                   goldenPath.url=goldenPath.url)
    }
    if (assembled.molecules.only)
        warning(wmsg("'assembled.molecules' got ignored for unregistered ",
                     "UCSC genome ", genome, " (don't know what the ",
                     "assembled molecules are for an unregistered UCSC ",
                     "genome)"))
    ans
}

### Return NULL or a "raw" chrom info data.frame, that is, a 4-column
### data.frame with columns "chrom" (character), "size" (integer),
### "assembled" (logical), and "circular" (logical).
### The returned data.frame must have one row per assembled molecule.
.load_stored_amol_info_for_UCSC_genome <-
    function(GENOME, ASSEMBLED_MOLECULES, CIRC_SEQS)
{
    filename <- paste0(GENOME, ".tab")
    filepath <- system.file("extdata", "assembled_molecules_db", "UCSC",
                            filename, package="GenomeInfoDb")
    if (identical(filepath, ""))
        return(NULL)
    ans <- read_UCSC_assembled_molecules_db_table(filepath)
    expected_circular <- make_circ_flags_from_circ_seqs(ASSEMBLED_MOLECULES,
                                                        CIRC_SEQS)
    stopifnot(
        identical(sapply(ans, class),
                  c(chrom="character", size="integer", circular="logical")),
        identical(ans[ , "chrom"], ASSEMBLED_MOLECULES),
        identical(ans[ , "circular"], expected_circular)
    )

    ## Add "assembled" column.
    cbind(ans[ , c("chrom", "size")],
          assembled=rep.int(TRUE, nrow(ans)),
          circular=ans[ , "circular"])
}

### Return a "raw" chrom info data.frame (i.e. 4 core columns only, see above).
.fetch_raw_chrom_info_from_UCSC <- function(GENOME,
    ASSEMBLED_MOLECULES,
    CIRC_SEQS,
    FETCH_ORDERED_CHROM_SIZES=NULL,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    nb_assembled <- length(ASSEMBLED_MOLECULES)

    ## Obtain chrom sizes from UCSC.
    if (is.null(FETCH_ORDERED_CHROM_SIZES)) {
        chrom_sizes <- fetch_chrom_sizes_from_UCSC(GENOME,
                             goldenPath.url=goldenPath.url)
        stopifnot(nrow(chrom_sizes) == nb_assembled)
        oo <- match(ASSEMBLED_MOLECULES, chrom_sizes[ , "chrom"])
        stopifnot(!anyNA(oo))
        chrom_sizes <- S4Vectors:::extract_data_frame_rows(chrom_sizes, oo)
    } else {
        chrom_sizes <- FETCH_ORDERED_CHROM_SIZES(goldenPath.url=goldenPath.url)
        stopifnot(is.data.frame(chrom_sizes),
                  identical(sapply(chrom_sizes, class),
                            c(chrom="character", size="integer")),
                  identical(head(chrom_sizes[ , "chrom"], n=nb_assembled),
                            ASSEMBLED_MOLECULES),
                  is_primary_key(chrom_sizes[ , "chrom"]))
    }

    ## Prepare "assembled" column.
    assembled <- logical(nrow(chrom_sizes))
    assembled[seq_len(nb_assembled)] <- TRUE

    ## Prepare "circular" column.
    circular <- make_circ_flags_from_circ_seqs(chrom_sizes[ , "chrom"],
                                               CIRC_SEQS)

    ## Add "assembled" and "circular" columns.
    cbind(chrom_sizes, assembled=assembled, circular=circular)
}

### Return a "raw" chrom info data.frame (i.e. 4 core columns only, see above).
.get_raw_chrom_info_for_registered_UCSC_genome <- function(GENOME,
    ASSEMBLED_MOLECULES,
    CIRC_SEQS,
    FETCH_ORDERED_CHROM_SIZES=NULL,
    assembled.molecules.only=FALSE,
    goldenPath.url=getOption("UCSC.goldenPath.url"),
    recache=FALSE)
{
    stored_amol_info <- .load_stored_amol_info_for_UCSC_genome(GENOME,
                                          ASSEMBLED_MOLECULES, CIRC_SEQS)
    if (!is.null(stored_amol_info) && assembled.molecules.only && !recache)
        return(stored_amol_info)
    ans <- .UCSC_cached_chrom_info[[GENOME]]
    if (is.null(ans) || recache) {
        ans <- .fetch_raw_chrom_info_from_UCSC(GENOME, ASSEMBLED_MOLECULES,
                                      CIRC_SEQS, FETCH_ORDERED_CHROM_SIZES,
                                      goldenPath.url=goldenPath.url)
        if (!is.null(stored_amol_info)) {
            assembled_idx <- seq_along(ASSEMBLED_MOLECULES)
            latest_amol_info <- S4Vectors:::extract_data_frame_rows(ans,
                                                         assembled_idx)
            if (!identical(stored_amol_info, latest_amol_info))
                warning(wmsg("The chromosome info obtained from UCSC ",
                             "for genome ", GENOME, " no longer matches ",
                             "the records stored in the GenomeInfoDb ",
                             "package. Could it be that ", GENOME, " has ",
                             "changed?"))
        }
        .UCSC_cached_chrom_info[[GENOME]] <- ans
    }
    ans
}

.get_NCBI_linker <- function(map.NCBI, linker, GENOME)
{
    if (is.list(map.NCBI))
        return(map.NCBI)
    if (isFALSE(map.NCBI))
        return(NULL)
    if (is.null(linker))
        warning(wmsg("'map.NCBI' got ignored for registered ",
                     "UCSC genome ", GENOME, " (additional NCBI columns ",
                     "are only available for registered UCSC genomes ",
                     "based on an NCBI assembly)"))
    linker
}

.get_Ensembl_linker <- function(add.ensembl.col, linker, GENOME)
{
    if (identical(add.ensembl.col, "force"))
        return(NA)
    if (isFALSE(add.ensembl.col))
        return(NULL)
    if (is.null(linker))
        warning(wmsg("'add.ensembl.col' got ignored for registered ",
                     "UCSC genome ", GENOME, " (genome was registered ",
                     "with no Ensembl sequence names linked to it)."),
                     "\n  ",
                wmsg("Use 'add.ensembl.col=\"force\"' to try adding ",
                     "the \"ensembl\" column anyway."))
    linker
}

.get_chrom_info_for_registered_UCSC_genome <- function(script_path,
    assembled.molecules.only=FALSE,
    map.NCBI=FALSE,
    add.ensembl.col=FALSE,
    goldenPath.url=getOption("UCSC.goldenPath.url"),
    recache=FALSE)
{
    vars <- .parse_script_for_registered_UCSC_genome(script_path)
    GENOME <- vars$GENOME
    ASSEMBLED_MOLECULES <- vars$ASSEMBLED_MOLECULES

    ## Get "raw" chrom info (i.e. 4 core columns only).
    ans <- .get_raw_chrom_info_for_registered_UCSC_genome(GENOME,
                     ASSEMBLED_MOLECULES, vars$CIRC_SEQS,
                     FETCH_ORDERED_CHROM_SIZES=vars$FETCH_ORDERED_CHROM_SIZES,
                     assembled.molecules.only=assembled.molecules.only,
                     goldenPath.url=goldenPath.url,
                     recache=recache)

    ## Add NCBI cols.
    NCBI_linker <- .get_NCBI_linker(map.NCBI, vars$NCBI_LINKER, GENOME)
    if (!is.null(NCBI_linker))
        ans <- do.call(.add_NCBI_cols_to_UCSC_chrom_info,
                       c(list(ans), NCBI_linker))

    ## Add Ensembl col.
    Ensemb_linker <- .get_Ensembl_linker(add.ensembl.col, vars$ENSEMBL_LINKER,
                                         GENOME)
    if (!is.null(Ensemb_linker))
        ans <- .add_ensembl_column(ans, GENOME, table=Ensemb_linker,
                                   goldenPath.url=goldenPath.url)

    if (assembled.molecules.only) {
        assembled_idx <- seq_along(ASSEMBLED_MOLECULES)
        ans <- S4Vectors:::extract_data_frame_rows(ans, assembled_idx)
    }

    if (!is.null(NCBI_linker)) {
        assembly_accession <- NCBI_linker$assembly_accession
        NCBI_assembly_info <-
            find_NCBI_assembly_info_for_accession(assembly_accession)
        attr(ans, "NCBI_assembly_info") <- NCBI_assembly_info
    }
    ans
}

.make_Seqinfo_from_UCSC_chrominfo <- function(chrominfo, genome=NA)
{
    Seqinfo(seqnames=chrominfo[ , "chrom"],
            seqlengths=chrominfo[ , "size"],
            isCircular=chrominfo[ , "circular"],
            genome=genome)
}

### By default, return a "raw" chrom info data.frame, that is, a 4-column
### data.frame with columns "chrom" (character), "size" (integer),
### "assembled" (logical), and "circular" (logical).
getChromInfoFromUCSC <- function(genome,
    assembled.molecules.only=FALSE,
    map.NCBI=FALSE,
    add.ensembl.col=FALSE,
    goldenPath.url=getOption("UCSC.goldenPath.url"),
    recache=FALSE,
    as.Seqinfo=FALSE)
{
    if (!isSingleString(genome))
        stop(wmsg("'genome' must be a single string"))
    if (!isTRUEorFALSE(assembled.molecules.only))
        stop(wmsg("'assembled.molecules.only' must be TRUE or FALSE"))
    if (!(isTRUEorFALSE(map.NCBI) ||
          is.list(map.NCBI) && !is.null(names(map.NCBI))))
        stop(wmsg("'map.NCBI' must be TRUE or FALSE ",
                  "(experts can also supply an \"NCBI linker\" ",
                  "as a named list)"))
    if (!(isTRUEorFALSE(add.ensembl.col) ||
          identical(add.ensembl.col, "force")))
        stop(wmsg("'add.ensembl.col' must be TRUE or FALSE ",
                  "(experts can also use \"force\")"))
    if (!isSingleString(goldenPath.url))
        stop(wmsg("'goldenPath.url' must be a single string"))
    if (!isTRUEorFALSE(recache))
        stop(wmsg("'recache' must be TRUE or FALSE"))
    if (!isTRUEorFALSE(as.Seqinfo))
        stop(wmsg("'as.Seqinfo' must be TRUE or FALSE"))

    script_name <- paste0(genome, ".R")
    script_path <- system.file("registered", "UCSC_genomes", script_name,
                               package="GenomeInfoDb")

    if (identical(script_path, "")) {
        if (!isFALSE(map.NCBI))
            warning(wmsg("'map.NCBI' got ignored for unregistered ",
                         "UCSC genome ", genome, " (additional NCBI columns ",
                         "are only available for registered UCSC genomes ",
                         "based on an NCBI assembly)"))
        ans <- .get_chrom_info_for_unregistered_UCSC_genome(genome,
                    assembled.molecules.only=assembled.molecules.only,
                    add.ensembl.col=add.ensembl.col,
                    goldenPath.url=goldenPath.url,
                    recache=recache)
    } else {
        ans <- .get_chrom_info_for_registered_UCSC_genome(script_path,
                    assembled.molecules.only=assembled.molecules.only,
                    map.NCBI=map.NCBI,
                    add.ensembl.col=add.ensembl.col,
                    goldenPath.url=goldenPath.url,
                    recache=recache)
    }

    if (as.Seqinfo)
        ans <- .make_Seqinfo_from_UCSC_chrominfo(ans, genome=genome)
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### get_and_fix_chrom_info_from_UCSC()
###
### A wrapper to getChromInfoFromUCSC() that tries to improve handling of
### non-registered genomes by:
### - ordering the sequences based on the heuristic implemented in
###   rankSeqlevels()/orderSeqlevels()
### - warning the user about the circularity flags being just a guess
###   if the caller doesn't supply 'circ_seqs'
### To use in higher-level functions like makeTxDbFromUCSC() in the
### GenomicFeatures package.

.warn_about_circularity_guess <- function(genome, circ_seqs,
                                          warning_tip1="", warning_tip2="")
{
    if (length(circ_seqs) == 0L) {
        guess <- c("no sequence in ", genome , " is circular")
    } else if (length(circ_seqs) == 1L) {
        guess <- c("sequence ", circ_seqs, " is circular")
    } else {
        in1string <- paste(circ_seqs, collapse=", ")
        guess <- c(length(circ_seqs), " sequences (",
                   in1string, ") are circular")
    }
    warning(wmsg("UCSC genome ", genome, " is not registered in the ",
                 "GenomeInfoDb package (see '?registered_UCSC_genomes'), ",
                 "so we made the following guess about this genome: ",
                 guess, ". ", warning_tip1),
            "\n  ",
            wmsg("If you believe that this guess is incorrect, please ",
                 "let us know by opening an issue at:"),
            "\n      https://github.com/Bioconductor/GenomeInfoDb\n  ",
            wmsg("In the meantime, you can supply your own list of circular ",
                 "sequences (as a character vector) via the 'circ_seqs' ",
                 "argument. ", warning_tip2))
}

### 'warning_tip1' and 'warning_tip2' must be single strings. They will be
### added to the warning that we will issue if getChromInfoFromUCSC() had
### to guess the circularity flags.
get_and_fix_chrom_info_from_UCSC <- function(genome,
    ...,
    as.Seqinfo=FALSE,
    circ_seqs=NULL,
    warning_tip1="",
    warning_tip2="")
{
    if (!isSingleString(genome))
        stop(wmsg("'genome' must be a single string"))
    if (!isTRUEorFALSE(as.Seqinfo))
        stop(wmsg("'as.Seqinfo' must be TRUE or FALSE"))
    if (!isSingleString(warning_tip1))
        stop(wmsg("'warning_tip1' must be a single string"))
    if (!isSingleString(warning_tip2))
        stop(wmsg("'warning_tip2' must be a single string"))

    UCSC_genomes <- registered_UCSC_genomes()[ , "genome"]
    is_registered <- genome %in% UCSC_genomes

    ans <- getChromInfoFromUCSC(genome, ..., as.Seqinfo=FALSE)
    if (!is_registered) {
        oo <- orderSeqlevels(ans[ , "chrom"])
        ans <- S4Vectors:::extract_data_frame_rows(ans, oo)
    }
    seqlevels <- ans[ , "chrom"]
    if (!is.null(circ_seqs)) {
        ## The user supplied 'circ_seqs', so we use their input to infer the
        ## circularity flags.
        ## Note that the user would typically supply 'circ_seqs' when the
        ## genome is NOT registered. In the case of a registered genome,
        ## they don't need to do this, because, in this case, the circularity
        ## flags returned by getChromInfoFromUCSC() should be accurate (they
        ## were manually curated). However, if the user does supply the
        ## 'circ_seqs' for a registered genome, we assume that they know
        ## what they are doing, so we still use their input to replace the
        ## circularity flags returned by getChromInfoFromUCSC().
        ans[ , "circular"] <- make_circ_flags_from_circ_seqs(
                                              seqlevels,
                                              circ_seqs=circ_seqs)
    } else if (!is_registered) {
        ## The user did NOT supply 'circ_seqs'. And also because 'genome'
        ## is NOT a registered UCSC genome, getChromInfoFromUCSC() had to
        ## guess the circularity flags that it returned in 'ans'. Note
        ## that this uncertainty about the circularity flags is reflected
        ## in the presence of NA's instead of FALSE's in 'ans[ , "circular"]'.
        ## In addition to those NA's (which are easy to miss), we also warn
        ## the user about this guess with an explicit warning.
        circ_seqs <- seqlevels[which(ans[ , "circular"])]
        .warn_about_circularity_guess(genome, circ_seqs,
                                      warning_tip1, warning_tip2)
    }

    if (as.Seqinfo)
        ans <- .make_Seqinfo_from_UCSC_chrominfo(ans, genome=genome)
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### saveAssembledMoleculesInfoFromUCSC()
###
### Not intended for the end user.
### Use case is to add "assembled molecules info" for the specified UCSC
### genomes to the GenomeInfoDb package. The genomes must be **registered**.
### See README.TXT in GenomeInfoDb/inst/extdata/assembled_molecules_db/UCSC/
### for more information.

### Vectorized.
saveAssembledMoleculesInfoFromUCSC <- function(
    genomes, dir=".",
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    if (!is.character(genomes))
        stop(wmsg("'genomes' must be a character vector"))
    if (!isSingleString(dir))
        stop(wmsg("'dir' must be a single string specifying the path ",
                  "to the directory where to save the RDS file"))
    expected_colnames <- c("chrom", "size", "assembled", "circular")
    for (genome in genomes) {
        chrominfo <- getChromInfoFromUCSC(genome, assembled.molecules.only=TRUE,
                                          goldenPath.url=goldenPath.url,
                                          recache=TRUE)
        stopifnot(identical(colnames(chrominfo), expected_colnames))
        chrominfo <- chrominfo[ , -3L]
        filename <- paste0(genome, ".tab")
        filepath <- file.path(dir, filename)
        write.table(chrominfo, file=filepath,
                    quote=FALSE, sep="\t", row.names=FALSE)
    }
}
Bioconductor/GenomeInfoDb documentation built on May 24, 2024, 11:13 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Bioconductor/GenomeInfoDb
Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

R/getChromInfoFromUCSC.R
In Bioconductor/GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

Documented in get_and_fix_chrom_info_from_UCSC getChromInfoFromUCSC registered_UCSC_genomes saveAssembledMoleculesInfoFromUCSC

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/GenomeInfoDb Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

R/getChromInfoFromUCSC.R In Bioconductor/GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

Documented in get_and_fix_chrom_info_from_UCSC getChromInfoFromUCSC registered_UCSC_genomes saveAssembledMoleculesInfoFromUCSC

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/GenomeInfoDb
Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

R/getChromInfoFromUCSC.R
In Bioconductor/GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style