R/getChromInfoFromUCSC.R
In GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

Documented in getChromInfoFromUCSC registered_UCSC_genomes

### =========================================================================
### getChromInfoFromUCSC()
### -------------------------------------------------------------------------
###


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .add_NCBI_cols_to_UCSC_chrom_info()
###

.match_UCSC_seqlevels_to_NCBI_accns <- function(UCSC_seqlevels, NCBI_accns,
                                                trim.seqlevels=FALSE,
                                                accn.suffix="")
{
    if (trim.seqlevels) {
        UCSC_seqlevels <- sub("_(alt|random|fix)$", "", UCSC_seqlevels)
        UCSC_seqlevels <- sub("^[^_]*_", "", UCSC_seqlevels)
    }
    x <- chartr("v-", "..", UCSC_seqlevels)
    x <- paste0(x, accn.suffix)
    names(x) <- UCSC_seqlevels
    solid_match2(tolower(x), tolower(NCBI_accns),
                 x_what="UCSC seqlevel", table_what="accession number")
}

.match_UCSC_seqlevels_part2_to_NCBI_accns <-
    function(UCSC_seqlevels, NCBI_accns, part2.auto.version="", accn.prefix="")
{
    ans <- rep.int(NA_integer_, length(UCSC_seqlevels))
    seqlevel_parts <- strsplit(UCSC_seqlevels, "_")
    nparts <- lengths(seqlevel_parts)
    idx2 <- which(nparts >= 2L)
    if (length(idx2) == 0L)
        return(ans)
    offsets <- c(0L, cumsum(nparts[idx2[-length(idx2)]]))
    x <- unlist(seqlevel_parts[idx2], use.names=FALSE)[offsets + 2L]
    x <- chartr("v", ".", x)
    unversioned_idx <- grep(".", x, fixed=TRUE, invert=TRUE)
    if (length(unversioned_idx) != 0L)
        x[unversioned_idx] <- paste0(x[unversioned_idx], part2.auto.version)
    x <- paste0(accn.prefix, x)
    names(x) <- UCSC_seqlevels[idx2]
    ans[idx2] <- solid_match2(tolower(x), tolower(NCBI_accns),
                              x_what="UCSC seqlevel",
                              table_what="accession number")
    ans
}

### The workhorse behind .add_NCBI_cols_to_UCSC_chrom_info().
### - All input vectors must be character vectors.
### - All NCBI input vectors must have the same length.
### - Vectors 'UCSC_seqlevels' and 'NCBI_seqlevels' must be "primary
###   keys" i.e. must not contain NAs, empty strings, or duplicates.
###   (No such assumption is made about the other input vectors.)
### Returns an integer vector parallel to 'UCSC_seqlevels'.
.map_UCSC_seqlevels_to_NCBI_seqlevels <- function(UCSC_seqlevels,
                                                  NCBI_seqlevels,
                                                  NCBI_UCSCStyleName,
                                                  NCBI_GenBankAccn,
                                                  NCBI_RefSeqAccn,
                                                  special_mappings=NULL)
{
    stopifnot(is.character(UCSC_seqlevels),
              is_primary_key(UCSC_seqlevels),
              is.character(NCBI_seqlevels),
              is_primary_key(NCBI_seqlevels),
              is.character(NCBI_UCSCStyleName),
              is.character(NCBI_GenBankAccn),
              is.character(NCBI_RefSeqAccn))

    L2R <- rep.int(NA_integer_, length(UCSC_seqlevels))
    unmapped_idx <- seq_along(L2R)

    ## 1. Handle special mappings.
    if (!is.null(special_mappings)) {
        m1 <- match(names(special_mappings), UCSC_seqlevels)
        if (anyNA(m1))
            stop(wmsg("'names(special_mappings)' contains sequence ",
                      "names not found in the UCSC genome"))
        m2 <- match(special_mappings, NCBI_seqlevels)
        if (anyNA(m2))
            stop(wmsg("'special_mappings' contains sequence ",
                      "names not found in the NCBI assembly"))
        L2R[m1] <- m2

        unmapped_idx <- which(is.na(L2R))
        if (length(unmapped_idx) == 0L)
            return(L2R)

        NCBI_UCSCStyleName[m2] <- NA_character_
    }

    ## 2. We assign based on exact match between 'UCSC_seqlevels'
    ##    and 'NCBI_UCSCStyleName'.
    m <- match(UCSC_seqlevels[unmapped_idx], NCBI_UCSCStyleName)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 3. We assign based on exact match (case insensitive) between UCSC
    ##    and NCBI seqlevels.
    ucsc_seqlevels <- tolower(UCSC_seqlevels)
    ncbi_seqlevels <- tolower(NCBI_seqlevels)
    m <- match(ucsc_seqlevels[unmapped_idx], ncbi_seqlevels)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 4. We assign based on exact match (case insensitive) between UCSC
    ##    and NCBI seqlevels after removal of the "chr" prefix on both sides.
    nochr_ucsc_seqlevels <- sub("^chr", "", ucsc_seqlevels[unmapped_idx])
    nochr_ncbi_seqlevels <- sub("^chr", "", ncbi_seqlevels)
    m <- match(nochr_ucsc_seqlevels, nochr_ncbi_seqlevels)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 5. We assign based on match between UCSC seqlevels and RefSeq
    ##    accession numbers.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_RefSeqAccn)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 6. We assign based on match between UCSC seqlevels and GenBank
    ##    accession numbers.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_GenBankAccn)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 7. We assign based on match between UCSC seqlevels and GenBank
    ##    accession numbers after adding ".1" suffix to UCSC seqlevels.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_GenBankAccn,
                                             accn.suffix=".1")
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 8. We assign based on match between UCSC seqlevels and RefSeq
    ##    accession numbers after trimming the first part (e.g. "chr1_")
    ##    and "_random" suffix from the seqlevels.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_RefSeqAccn,
                                             trim.seqlevels=TRUE)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 9. We assign based on GenBank accession number inferred from part 2 of
    ##    UCSC seqlevels (after adding automatic suffix ".1" to the inferred
    ##    GenBank accession number if it's not versioned).
    m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
                                        UCSC_seqlevels[unmapped_idx],
                                        NCBI_GenBankAccn,
                                        part2.auto.version=".1")
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 10. We assign based on GenBank accession number inferred from part 2 of
    ##    UCSC seqlevels (after dropping the version from the GenBank accession
    ##    numbers provided by NCBI).
    NCBI_GenBankAccn0 <- sub("\\.[^.]*$", "", NCBI_GenBankAccn)
    m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
                                        UCSC_seqlevels[unmapped_idx],
                                        NCBI_GenBankAccn0)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 11. We assign based on GenBank accession number inferred from part 2 of
    ##    UCSC seqlevels after adding AAD prefix to it.
    m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
                                        UCSC_seqlevels[unmapped_idx],
                                        NCBI_GenBankAccn,
                                        part2.auto.version=".1",
                                        accn.prefix="AAD")
    L2R[unmapped_idx] <- m

    L2R
}

.add_NCBI_cols_to_UCSC_chrom_info <- function(
    UCSC_chrom_info, assembly_accession,
    AssemblyUnits=NULL, special_mappings=NULL, unmapped_seqs=NULL,
    drop_unmapped=FALSE)
{
    UCSC_seqlevels <- UCSC_chrom_info[ , "chrom"]

    if (length(unmapped_seqs) != 0L) {
        unmapped_seqs_role <- rep.int(names(unmapped_seqs),
                                      lengths(unmapped_seqs))
        unmapped_seqs <- unlist(unmapped_seqs, use.names=FALSE)
        unmapped_idx <- match(unmapped_seqs, UCSC_seqlevels)
        stopifnot(!anyNA(unmapped_idx))
    }

    NCBI_chrom_info <- getChromInfoFromNCBI(assembly_accession,
                                            assembly.units=AssemblyUnits)

    NCBI_seqlevels     <- NCBI_chrom_info[ , "SequenceName"]
    NCBI_UCSCStyleName <- NCBI_chrom_info[ , "UCSCStyleName"]
    NCBI_GenBankAccn   <- NCBI_chrom_info[ , "GenBankAccn"]
    NCBI_RefSeqAccn    <- NCBI_chrom_info[ , "RefSeqAccn"]
    L2R <- .map_UCSC_seqlevels_to_NCBI_seqlevels(
                                       UCSC_seqlevels,
                                       NCBI_seqlevels,
                                       NCBI_UCSCStyleName,
                                       NCBI_GenBankAccn,
                                       NCBI_RefSeqAccn,
                                       special_mappings=special_mappings)
    L2R_is_NA <- is.na(L2R)
    mapped_idx <- which(!L2R_is_NA)

    if (length(unmapped_seqs) != 0L)
        stopifnot(all(L2R_is_NA[unmapped_idx]))

    if (isTRUE(drop_unmapped)) {
        UCSC_chrom_info <-
            S4Vectors:::extract_data_frame_rows(UCSC_chrom_info, mapped_idx)
        L2R <- L2R[mapped_idx]
        mapped_idx <- seq_along(L2R)
        ## Before we drop the unmapped UCSC seqlevels, we want to make sure
        ## that all the NCBI seqlevels are reverse mapped. For example, in
        ## the case of hg38, the chromInfo table at UCSC contains sequences
        ## that belong to GRCh38.p12 but not to GRCh38. So we want to make
        ## sure that after we drop these "foreign sequences", we are left
        ## with a one-to-one mapping between UCSC seqlevels and the 455 NCBI
        ## seqlevels that are in the assembly report for GRCh38.
        idx <- setdiff(seq_along(NCBI_seqlevels), L2R)
        if (length(idx) != 0L) {
            in1string <- paste0(NCBI_seqlevels[idx], collapse=", ")
            stop(wmsg("no UCSC seqlevel could be mapped to the following ",
                      "NCBI seqlevel(s): ", in1string))
        }
    } else {
        unexpectedly_unmapped_idx <-
            which(L2R_is_NA & !(UCSC_seqlevels %in% unmapped_seqs))
        if (length(unexpectedly_unmapped_idx) != 0L) {
            in1string <- paste0(UCSC_seqlevels[unexpectedly_unmapped_idx],
                                collapse=", ")
            stop(wmsg("cannot map the following UCSC seqlevel(s) to an ",
                      "NCBI seqlevel: ", in1string))
        }
    }

    NCBI_chrom_info <- S4Vectors:::extract_data_frame_rows(NCBI_chrom_info, L2R)

    ## NCBI columns "circular", "SequenceLength", and "UCSCStyleName"
    ## are expected to be redundant with UCSC columns "circular", "size",
    ## and "chrom", respectively. Let's make sure these 3 NCBI columns
    ## agree with their UCSC counterpart before dropping them.

    stopifnot(identical(UCSC_chrom_info[mapped_idx, "circular"],
                        NCBI_chrom_info[mapped_idx, "circular"]))

    compare_idx <- which(!is.na(NCBI_chrom_info[ , "SequenceLength"]))
    stopifnot(identical(UCSC_chrom_info[compare_idx, "size"],
                        NCBI_chrom_info[compare_idx, "SequenceLength"]))

    ## According to the UCSC-style-name column of the assembly report for
    ## GRCh37.p13 (accession GCF_000001405.25), MT in GRCh37.p13 is mapped
    ## to chrM in hg19, which is wrong! So we skip this sanity check for
    ## GCF_000001405.25.
    if (assembly_accession != "GCF_000001405.25") {
        compare_idx <- which(!is.na(NCBI_chrom_info[ , "UCSCStyleName"]))
        stopifnot(identical(UCSC_chrom_info[compare_idx, "chrom"],
                            NCBI_chrom_info[compare_idx, "UCSCStyleName"]))
    }

    drop_columns <- c("SequenceLength", "UCSCStyleName", "circular")
    NCBI_chrom_info <- drop_cols(NCBI_chrom_info, drop_columns)
    colnames(NCBI_chrom_info) <- paste0("NCBI.", colnames(NCBI_chrom_info))
    ans <- cbind(UCSC_chrom_info, NCBI_chrom_info)

    if (length(unmapped_seqs) != 0L)
        ans[unmapped_idx, "NCBI.SequenceRole"] <- unmapped_seqs_role

    stopifnot(!is.unsorted(ans[ , "NCBI.SequenceRole"]))
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .add_ensembl_column()
###

.fetch_ucscToEnsembl_table <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    col2class <- c(ucsc="character", ensembl="character")
    fetch_table_from_UCSC(genome, "ucscToEnsembl",
                          col2class=col2class,
                          goldenPath.url=goldenPath.url)
}

### Filters and reformats the chromAlias data to so it's returned in the
### same format as the ucscToEnsembl table (i.e. 2-column data frame with
### columns "ucsc" and "ensembl").
.fetch_ucsc2ensembl_from_chromAlias_table <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    col2class <- c(ensembl="character", ucsc="character", source="factor")
    chromAlias <- fetch_table_from_UCSC(genome, "chromAlias",
                                        col2class=col2class,
                                        goldenPath.url=goldenPath.url)
    ## Filters and reformats.
    keep_idx <- grep("ensembl", chromAlias[ , "source"], fixed=TRUE)
    ucsc2ensembl <- chromAlias[keep_idx, c("ucsc", "ensembl")]

    ## In some **very rare** situations, it seems that the chromAlias table
    ## can contain ambiguous ucsc-to-ensembl mappings where a given UCSC
    ## chromosome name is mapped to more than one Ensembl name. For example
    ## this happens for rn6 where unplaced scaffold chrUn_AABR07022993v1 is
    ## mapped to Ensembl names AABR07022518.1, AABR07023006.1, AABR07022993.1.
    ## Note that as of Jan 2020, the chromAlias table for rn6 is the only
    ## known case of ambiguous ucsc-to-ensembl mapping.
    if (anyDuplicated(ucsc2ensembl[ , "ucsc"])) {
        ## The disambiguation algorithm below assumes that the ambiguous
        ## ucsc-to-ensembl mappings are of the form xxYYY-to-ZZZ where
        ## ZZZ is a GenBank accession, YYY a GenBank accession where the
        ## dot (".") has been replaced with a "v" and xx can be any suffix.
        ## To disambiguate, we only keep mappings where YYY and ZZZ are the
        ## same (modulo the dot/v substitution).
        idx <- which(duplicated(ucsc2ensembl[ , "ucsc"]) |
                     duplicated(ucsc2ensembl[ , "ucsc"], fromLast=TRUE))
        ensembl <- chartr(".", "v", ucsc2ensembl[idx , "ensembl"])
        ucsc <- ucsc2ensembl[idx , "ucsc"]
        ucsc_nc <- nchar(ucsc)
        ucsc_suffix <- substr(ucsc, ucsc_nc - nchar(ensembl) + 1L, ucsc_nc)
        ok <- ucsc_suffix == ensembl
        if (sum(ok) == 0L)
            stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
        if (!all(ucsc %in% ucsc[ok]))
            stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
        if (anyDuplicated(ucsc[ok]))
            stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
        drop_idx <- idx[!ok]
        ucsc2ensembl <- ucsc2ensembl[-drop_idx, , drop=FALSE]
    }
    ucsc2ensembl
}

.try_to_fetch_ucscToEnsembl <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    message("Trying 'ucscToEnsembl' ... ", appendLF=FALSE)
    ucscToEnsembl <- try(
        suppressWarnings(
            .fetch_ucscToEnsembl_table(genome, goldenPath.url=goldenPath.url)
        ), silent=TRUE)
    if (inherits(ucscToEnsembl, "try-error")) {
        message("FAILED! (table does not exist)")
        return(NULL)
    }
    message("OK")
    ucscToEnsembl
}

.try_to_fetch_ucsc2ensembl_from_chromAlias <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    message("Trying 'chromAlias' ... ", appendLF=FALSE)
    ucsc2ensembl <- try(
        suppressWarnings(
            .fetch_ucsc2ensembl_from_chromAlias_table(genome,
                                                goldenPath.url=goldenPath.url)
        ), silent=TRUE)
    if (inherits(ucsc2ensembl, "try-error")) {
        message("FAILED! (table does not exist)")
        return(NULL)
    }
    if (nrow(ucsc2ensembl) == 0L) {
        message("FAILED! (table exists but has no Ensembl information)")
        return(NULL)
    }
    message("OK")
    ucsc2ensembl
}

.fetch_Ensembl_column <- function(UCSC_chroms, genome, table=NA,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    if (identical(table, "ucscToEnsembl")) {
        ucsc2ensembl <- .fetch_ucscToEnsembl_table(genome,
                                                goldenPath.url=goldenPath.url)
    } else if (identical(table, "chromAlias")) {
        ucsc2ensembl <- .fetch_ucsc2ensembl_from_chromAlias_table(genome,
                                                goldenPath.url=goldenPath.url)
    } else if (identical(table, NA)) {
        table <- "ucscToEnsembl"
        ucsc2ensembl <- .try_to_fetch_ucscToEnsembl(genome,
                                      goldenPath.url=goldenPath.url)
        if (is.null(ucsc2ensembl)) {
            table <- "chromAlias"
            ucsc2ensembl <- .try_to_fetch_ucsc2ensembl_from_chromAlias(genome,
                                      goldenPath.url=goldenPath.url)
            if (is.null(ucsc2ensembl)) {
                warning(wmsg("No Ensembl sequence names could be ",
                             "found for UCSC genome ", genome))
                return(rep.int(NA_character_, length(UCSC_chroms)))
            }
        }
    } else {
        stop(wmsg("supplied 'table' is not supported"))
    }
    oo <- match(UCSC_chroms, ucsc2ensembl[ , "ucsc"])
    if (is.null(table) && anyNA(oo))
        warning(wmsg("incomplete data in '", table, "' table for ",
                     "UCSC genome ", genome, ": some UCSC sequence ",
                     "names are missing"))
    ucsc2ensembl[oo , "ensembl"]
}

.add_ensembl_column <- function(UCSC_chrom_info, genome, table=NA,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    ensembl <- .fetch_Ensembl_column(UCSC_chrom_info[ , "chrom"],
                                     genome, table=table,
                                     goldenPath.url=goldenPath.url)
    cbind(UCSC_chrom_info, ensembl=ensembl, stringsAsFactors=FALSE)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### registered_UCSC_genomes()
###

.parse_script_for_registered_UCSC_genome <- function(script_path)
{
    filename <- basename(script_path)
    if (substr(filename, nchar(filename)-1L, nchar(filename)) != ".R")
        stop(wmsg("name of genome registration file '", filename, "' must ",
                  "have extension .R"))
    if (grepl(" ", filename, fixed=TRUE))
        stop(wmsg("name of genome registration file '", filename, "' must ",
                  "not contain spaces"))

    ## Placeholders. Will actually get defined when we source the script.
    ## See README.TXT in inst/registered/UCSC_genomes/ for the list of
    ## variables.
    GENOME <- ORGANISM <- ASSEMBLED_MOLECULES <- CIRC_SEQS <- NULL
    GET_CHROM_SIZES <- NCBI_LINKER <- ENSEMBL_LINKER <- NULL
    source(script_path, local=TRUE)

    stop_if <- function(notok, ...) {
        if (notok)
            stop("Error in UCSC genome registration file '", filename,
                 "':\n  ", wmsg(...))
    }

    ## Check GENOME.
    stop_if(is.null(GENOME), "'GENOME' must be defined")
    stop_if(!isSingleString(GENOME), "'GENOME' must be a single string")
    target <- substr(filename, 1L, nchar(filename)-2L)
    stop_if(!identical(target, GENOME), "'GENOME' must match filename")

    ## Check ORGANISM.
    stop_if(is.null(ORGANISM), "'ORGANISM' must be defined")
    stop_if(!isSingleString(ORGANISM), "'ORGANISM' must be a single string")
    stop_if(grepl("_", ORGANISM, fixed=TRUE),
            "underscores are not allowed in 'ORGANISM' (use spaces instead)")

    ## Check ASSEMBLED_MOLECULES.
    stop_if(is.null(ASSEMBLED_MOLECULES),
            "'ASSEMBLED_MOLECULES' must be defined")
    stop_if(!is.character(ASSEMBLED_MOLECULES),
            "'ASSEMBLED_MOLECULES' must be a character vector")
    stop_if(!is_primary_key(ASSEMBLED_MOLECULES),
            "'ASSEMBLED_MOLECULES' must ",
            "not contain NAs, empty strings, or duplicates")

    ## Check CIRC_SEQS.
    stop_if(is.null(CIRC_SEQS),
            "'CIRC_SEQS' must be defined")
    stop_if(!is.character(CIRC_SEQS),
            "'CIRC_SEQS' must be a character vector")
    stop_if(anyDuplicated(CIRC_SEQS),
            "'CIRC_SEQS' must not contain duplicates")
    stop_if(!all(CIRC_SEQS %in% ASSEMBLED_MOLECULES),
            "'CIRC_SEQS' must be a subset of 'ASSEMBLED_MOLECULES'")

    ## Check GET_CHROM_SIZES.
    if (!is.null(GET_CHROM_SIZES)) {
        stop_if(!is.function(GET_CHROM_SIZES),
                "when defined, 'GET_CHROM_SIZES' must be a function")
    }

    ## Check NCBI_LINKER.
    if (!is.null(NCBI_LINKER)) {
        stop_if(!is.list(NCBI_LINKER),
                "when defined, 'NCBI_LINKER' must be a named list")
        linker_fields <- names(NCBI_LINKER)
        stop_if(!is.character(linker_fields),
                "when defined, 'NCBI_LINKER' must be a named list")
        stop_if(!is_primary_key(linker_fields),
                "the names on 'NCBI_LINKER' must ",
                "not contain NAs, empty strings, or duplicates")
        stop_if(!("assembly_accession" %in% linker_fields),
                "'NCBI_LINKER' must have field \"assembly_accession\"")
        accession <- NCBI_LINKER$assembly_accession
        stop_if(!isSingleString(accession) || accession == "",
                "\"assembly_accession\" field in 'NCBI_LINKER' must ",
                "be a single non-empty string")
        NCBI_assembly_info <- find_NCBI_assembly_info_for_accession(accession)
        stop_if(is.null(NCBI_assembly_info),
                "\"assembly_accession\" field in 'NCBI_LINKER' must ",
                "be associated with a registered NCBI assembly")
        stop_if(!identical(ORGANISM, NCBI_assembly_info$organism),
                "the NCBI assembly associated with the \"assembly_accession\" ",
                "field in 'NCBI_LINKER' is registered for an organism ",
                "(\"", NCBI_assembly_info$organism, "\") that differs ",
                "from 'ORGANISM' (\"", ORGANISM, "\")")
    }

    ## Check ENSEMBL_LINKER.
    if (!is.null(ENSEMBL_LINKER)) {
        stop_if(!isSingleString(ENSEMBL_LINKER) || ENSEMBL_LINKER == "",
                "when defined, 'ENSEMBL_LINKER' must ",
                "be a single non-empty string")
    }

    list(GENOME=GENOME,
         ORGANISM=ORGANISM,
         ASSEMBLED_MOLECULES=ASSEMBLED_MOLECULES,
         CIRC_SEQS=CIRC_SEQS,
         GET_CHROM_SIZES=GET_CHROM_SIZES,
         NCBI_LINKER=NCBI_LINKER,
         ENSEMBL_LINKER=ENSEMBL_LINKER)
}

registered_UCSC_genomes <- function()
{
    dir_path <- system.file("registered", "UCSC_genomes",
                             package="GenomeInfoDb")
    file_paths <- list.files(dir_path, pattern="\\.R$", full.names=TRUE)
    assemblies <- lapply(file_paths, .parse_script_for_registered_UCSC_genome)

    colnames <- c("organism", "genome", "NCBI_assembly",
                  "assembly_accession", "with_Ensembl", "circ_seqs")
    make_col <- function(j) {
        colname <- colnames[[j]]
        if (colname == "NCBI_assembly") {
            col <- vapply(assemblies,
                function(assembly_info) {
                    linker <- assembly_info$NCBI_LINKER
                    if (is.null(linker))
                        return(NA_character_)
                    accession <- linker$assembly_accession
                    find_NCBI_assembly_info_for_accession(accession)$assembly
                }, character(1), USE.NAMES=FALSE)
            return(col)
        }
        if (colname == "assembly_accession") {
            col <- vapply(assemblies,
                function(assembly_info) {
                    linker <- assembly_info$NCBI_LINKER
                    if (is.null(linker))
                        return(NA_character_)
                    linker$assembly_accession
                }, character(1), USE.NAMES=FALSE)
            return(col)
        }
        if (colname == "with_Ensembl") {
            col <- vapply(assemblies,
                function(assembly_info) {
                    linker <- assembly_info$ENSEMBL_LINKER
                    !is.null(linker)
                }, logical(1), USE.NAMES=FALSE)
            return(col)
        }
        COLNAME <- toupper(colname)
        col0 <- lapply(assemblies, `[[`, COLNAME)
        if (colname == "circ_seqs")
            return(CharacterList(col0))
        stopifnot(all(lengths(col0) == 1L))
        col <- as.character(unlist(col0, use.names=FALSE))
        if (colname == "organism")
            col <- factor(col)  # order of levels will dictate order
                                # of rows in final DataFrame
        col
    }

    listData <- lapply(setNames(seq_along(colnames), colnames), make_col)
    ans <- S4Vectors:::new_DataFrame(listData, nrows=length(assemblies))
    genome_trailing_digits <- sub("(.*[^0-9])([0-9]*)$", "\\2", ans$genome)
    oo <- order(ans$organism, as.integer(genome_trailing_digits))
    as.data.frame(ans[oo, , drop=FALSE])
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### getChromInfoFromUCSC()
###

.UCSC_cached_chrom_info <- new.env(parent=emptyenv())

.get_chrom_info_for_unregistered_UCSC_genome <- function(genome,
    assembled.molecules.only=FALSE,
    add.ensembl.col=FALSE,
    goldenPath.url=getOption("UCSC.goldenPath.url"),
    recache=FALSE)
{
    ans <- .UCSC_cached_chrom_info[[genome]]
    if (is.null(ans) || recache) {
        ans <- try(suppressWarnings(
                       fetch_chrom_sizes_from_UCSC(genome,
                           goldenPath.url=goldenPath.url)),
                   silent=TRUE)
        if (inherits(ans, "try-error"))
            stop(wmsg("unknown UCSC genome: ", genome))
        oo <- orderSeqlevels(ans[ , "chrom"])
        ans <- S4Vectors:::extract_data_frame_rows(ans, oo)

        ## Add columns "assembled" and "circular".
        assembled <- rep.int(NA, nrow(ans))
        circular <- make_circ_flags_from_circ_seqs(ans[ , "chrom"])
        ans <- cbind(ans, assembled=assembled, circular=circular)

        .UCSC_cached_chrom_info[[genome]] <- ans
    }
    if (isTRUE(add.ensembl.col)) {
        warning(wmsg("'add.ensembl.col' got ignored for unregistered ",
                     "UCSC genome ", genome, "."),
                     "\n  ",
                wmsg("Use 'add.ensembl.col=\"force\"' to try adding ",
                     "the \"ensembl\" column anyway."))
    } else if (identical(add.ensembl.col, "force")) {
        ans <- .add_ensembl_column(ans, genome,
                                   goldenPath.url=goldenPath.url)
    }
    if (assembled.molecules.only)
        warning(wmsg("'assembled.molecules' got ignored for unregistered ",
                     "UCSC genome ", genome, " (don't know what the ",
                     "assembled molecules are for an unregistered UCSC ",
                     "genome)"))
    ans
}

.get_NCBI_linker <- function(map.NCBI, linker, GENOME)
{
    if (is.list(map.NCBI))
        return(map.NCBI)
    if (isFALSE(map.NCBI))
        return(NULL)
    if (is.null(linker))
        warning(wmsg("'map.NCBI' got ignored for registered ",
                     "UCSC genome ", GENOME, " (additional NCBI columns ",
                     "are only available for registered UCSC genomes ",
                     "based on an NCBI assembly)"))
    linker
}

.get_Ensembl_linker <- function(add.ensembl.col, linker, GENOME)
{
    if (identical(add.ensembl.col, "force"))
        return(NA)
    if (isFALSE(add.ensembl.col))
        return(NULL)
    if (is.null(linker))
        warning(wmsg("'add.ensembl.col' got ignored for registered ",
                     "UCSC genome ", GENOME, " (genome was registered ",
                     "with no Ensembl sequence names linked to it)."),
                     "\n  ",
                wmsg("Use 'add.ensembl.col=\"force\"' to try adding ",
                     "the \"ensembl\" column anyway."))
    linker
}

.get_chrom_info_for_registered_UCSC_genome <- function(script_path,
    assembled.molecules.only=FALSE,
    map.NCBI=FALSE,
    add.ensembl.col=FALSE,
    goldenPath.url=getOption("UCSC.goldenPath.url"),
    recache=FALSE)
{
    vars <- .parse_script_for_registered_UCSC_genome(script_path)
    GENOME <- vars$GENOME
    ASSEMBLED_MOLECULES <- vars$ASSEMBLED_MOLECULES

    nb_assembled <- length(ASSEMBLED_MOLECULES)
    assembled_idx <- seq_len(nb_assembled)

    ans <- .UCSC_cached_chrom_info[[GENOME]]
    if (is.null(ans) || recache) {
        GET_CHROM_SIZES <- vars$GET_CHROM_SIZES
        if (is.null(GET_CHROM_SIZES)) {
            ans <- fetch_chrom_sizes_from_UCSC(GENOME,
                                               goldenPath.url=goldenPath.url)
            stopifnot(nrow(ans) == nb_assembled)
            oo <- match(ASSEMBLED_MOLECULES, ans[ , "chrom"])
            stopifnot(!anyNA(oo))
            ans <- S4Vectors:::extract_data_frame_rows(ans, oo)
        } else {
            ans <- GET_CHROM_SIZES(goldenPath.url=goldenPath.url)
            stopifnot(is.data.frame(ans),
                      identical(sapply(ans, class),
                                c(chrom="character", size="integer")),
                      identical(head(ans[ , "chrom"], n=nb_assembled),
                                ASSEMBLED_MOLECULES),
                      is_primary_key(ans[ , "chrom"]))
        }

        ## Add columns "assembled" and "circular".
        assembled <- logical(nrow(ans))
        assembled[assembled_idx] <- TRUE
        circular <- make_circ_flags_from_circ_seqs(ans[ , "chrom"],
                                                   vars$CIRC_SEQS)
        ans <- cbind(ans, assembled=assembled, circular=circular)

        .UCSC_cached_chrom_info[[GENOME]] <- ans
    }

    ## Add NCBI cols.
    NCBI_linker <- .get_NCBI_linker(map.NCBI, vars$NCBI_LINKER, GENOME)
    if (!is.null(NCBI_linker))
        ans <- do.call(.add_NCBI_cols_to_UCSC_chrom_info,
                       c(list(ans), NCBI_linker))

    ## Add Ensembl col.
    Ensemb_linker <- .get_Ensembl_linker(add.ensembl.col, vars$ENSEMBL_LINKER,
                                         GENOME)
    if (!is.null(Ensemb_linker))
        ans <- .add_ensembl_column(ans, GENOME, table=Ensemb_linker,
                                   goldenPath.url=goldenPath.url)

    if (assembled.molecules.only)
        ans <- S4Vectors:::extract_data_frame_rows(ans, assembled_idx)

    if (!is.null(NCBI_linker)) {
        assembly_accession <- NCBI_linker$assembly_accession
        NCBI_assembly_info <-
            find_NCBI_assembly_info_for_accession(assembly_accession)
        attr(ans, "NCBI_assembly_info") <- NCBI_assembly_info
    }
    ans
}

### Return a 4-column data.frame with columns "chrom" (character), "size"
### (integer), "assembled" (logical), and "circular" (logical).
getChromInfoFromUCSC <- function(genome,
    assembled.molecules.only=FALSE,
    map.NCBI=FALSE,
    add.ensembl.col=FALSE,
    goldenPath.url=getOption("UCSC.goldenPath.url"),
    recache=FALSE,
    as.Seqinfo=FALSE)
{
    if (!isSingleString(genome))
        stop(wmsg("'genome' must be a single string"))
    if (!isTRUEorFALSE(assembled.molecules.only))
        stop(wmsg("'assembled.molecules.only' must be TRUE or FALSE"))
    if (!(isTRUEorFALSE(map.NCBI) ||
          is.list(map.NCBI) && !is.null(names(map.NCBI))))
        stop(wmsg("'map.NCBI' must be TRUE or FALSE ",
                  "(experts can also supply an \"NCBI linker\" ",
                  "as a named list)"))
    if (!(isTRUEorFALSE(add.ensembl.col) ||
          identical(add.ensembl.col, "force")))
        stop(wmsg("'add.ensembl.col' must be TRUE or FALSE ",
                  "(experts can also use \"force\")"))
    if (!isSingleString(goldenPath.url))
        stop(wmsg("'goldenPath.url' must be a single string"))
    if (!isTRUEorFALSE(recache))
        stop(wmsg("'recache' must be TRUE or FALSE"))
    if (!isTRUEorFALSE(as.Seqinfo))
        stop(wmsg("'as.Seqinfo' must be TRUE or FALSE"))

    script_name <- paste0(genome, ".R")
    script_path <- system.file("registered", "UCSC_genomes", script_name,
                               package="GenomeInfoDb")

    if (identical(script_path, "")) {
        if (!isFALSE(map.NCBI))
            warning(wmsg("'map.NCBI' got ignored for unregistered ",
                         "UCSC genome ", genome, " (additional NCBI columns ",
                         "are only available for registered UCSC genomes ",
                         "based on an NCBI assembly)"))
        ans <- .get_chrom_info_for_unregistered_UCSC_genome(genome,
                    assembled.molecules.only=assembled.molecules.only,
                    add.ensembl.col=add.ensembl.col,
                    goldenPath.url=goldenPath.url,
                    recache=recache)
    } else {
        ans <- .get_chrom_info_for_registered_UCSC_genome(script_path,
                    assembled.molecules.only=assembled.molecules.only,
                    map.NCBI=map.NCBI,
                    add.ensembl.col=add.ensembl.col,
                    goldenPath.url=goldenPath.url,
                    recache=recache)
    }

    if (!as.Seqinfo)
        return(ans)
    Seqinfo(seqnames=ans[ , "chrom"],
            seqlengths=ans[ , "size"],
            isCircular=ans[ , "circular"],
            genome=genome)
}
Any scripts or data that you put into this service are public.
GenomeInfoDb documentation built on April 9, 2021, 6 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
GenomeInfoDb
Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

R/getChromInfoFromUCSC.R
In GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

Documented in getChromInfoFromUCSC registered_UCSC_genomes

Try the GenomeInfoDb package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GenomeInfoDb Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

R/getChromInfoFromUCSC.R In GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

Documented in getChromInfoFromUCSC registered_UCSC_genomes

Try the GenomeInfoDb package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GenomeInfoDb
Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

R/getChromInfoFromUCSC.R
In GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style