R/getChromInfoFromUCSC.R

Defines functions getChromInfoFromUCSC .get_chrom_info_for_registered_UCSC_genome .get_Ensembl_linker .get_NCBI_linker .get_chrom_info_for_unregistered_UCSC_genome registered_UCSC_genomes .parse_script_for_registered_UCSC_genome .add_ensembl_column .fetch_Ensembl_column .try_to_fetch_ucsc2ensembl_from_chromAlias .try_to_fetch_ucscToEnsembl .fetch_ucsc2ensembl_from_chromAlias_table .fetch_ucscToEnsembl_table .add_NCBI_cols_to_UCSC_chrom_info .map_UCSC_seqlevels_to_NCBI_seqlevels .match_UCSC_seqlevels_part2_to_NCBI_accns .match_UCSC_seqlevels_to_NCBI_accns

Documented in getChromInfoFromUCSC registered_UCSC_genomes

### =========================================================================
### getChromInfoFromUCSC()
### -------------------------------------------------------------------------
###


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .add_NCBI_cols_to_UCSC_chrom_info()
###

.match_UCSC_seqlevels_to_NCBI_accns <- function(UCSC_seqlevels, NCBI_accns,
                                                trim.seqlevels=FALSE,
                                                accn.suffix="")
{
    if (trim.seqlevels) {
        UCSC_seqlevels <- sub("_(alt|random|fix)$", "", UCSC_seqlevels)
        UCSC_seqlevels <- sub("^[^_]*_", "", UCSC_seqlevels)
    }
    x <- chartr("v-", "..", UCSC_seqlevels)
    x <- paste0(x, accn.suffix)
    names(x) <- UCSC_seqlevels
    solid_match2(tolower(x), tolower(NCBI_accns),
                 x_what="UCSC seqlevel", table_what="accession number")
}

.match_UCSC_seqlevels_part2_to_NCBI_accns <-
    function(UCSC_seqlevels, NCBI_accns, part2.auto.version="", accn.prefix="")
{
    ans <- rep.int(NA_integer_, length(UCSC_seqlevels))
    seqlevel_parts <- strsplit(UCSC_seqlevels, "_")
    nparts <- lengths(seqlevel_parts)
    idx2 <- which(nparts >= 2L)
    if (length(idx2) == 0L)
        return(ans)
    offsets <- c(0L, cumsum(nparts[idx2[-length(idx2)]]))
    x <- unlist(seqlevel_parts[idx2], use.names=FALSE)[offsets + 2L]
    x <- chartr("v", ".", x)
    unversioned_idx <- grep(".", x, fixed=TRUE, invert=TRUE)
    if (length(unversioned_idx) != 0L)
        x[unversioned_idx] <- paste0(x[unversioned_idx], part2.auto.version)
    x <- paste0(accn.prefix, x)
    names(x) <- UCSC_seqlevels[idx2]
    ans[idx2] <- solid_match2(tolower(x), tolower(NCBI_accns),
                              x_what="UCSC seqlevel",
                              table_what="accession number")
    ans
}

### The workhorse behind .add_NCBI_cols_to_UCSC_chrom_info().
### - All input vectors must be character vectors.
### - All NCBI input vectors must have the same length.
### - Vectors 'UCSC_seqlevels' and 'NCBI_seqlevels' must be "primary
###   keys" i.e. must not contain NAs, empty strings, or duplicates.
###   (No such assumption is made about the other input vectors.)
### Returns an integer vector parallel to 'UCSC_seqlevels'.
.map_UCSC_seqlevels_to_NCBI_seqlevels <- function(UCSC_seqlevels,
                                                  NCBI_seqlevels,
                                                  NCBI_UCSCStyleName,
                                                  NCBI_GenBankAccn,
                                                  NCBI_RefSeqAccn,
                                                  special_mappings=NULL)
{
    stopifnot(is.character(UCSC_seqlevels),
              is_primary_key(UCSC_seqlevels),
              is.character(NCBI_seqlevels),
              is_primary_key(NCBI_seqlevels),
              is.character(NCBI_UCSCStyleName),
              is.character(NCBI_GenBankAccn),
              is.character(NCBI_RefSeqAccn))

    L2R <- rep.int(NA_integer_, length(UCSC_seqlevels))
    unmapped_idx <- seq_along(L2R)

    ## 1. Handle special mappings.
    if (!is.null(special_mappings)) {
        m1 <- match(names(special_mappings), UCSC_seqlevels)
        if (anyNA(m1))
            stop(wmsg("'names(special_mappings)' contains sequence ",
                      "names not found in the UCSC genome"))
        m2 <- match(special_mappings, NCBI_seqlevels)
        if (anyNA(m2))
            stop(wmsg("'special_mappings' contains sequence ",
                      "names not found in the NCBI assembly"))
        L2R[m1] <- m2

        unmapped_idx <- which(is.na(L2R))
        if (length(unmapped_idx) == 0L)
            return(L2R)

        NCBI_UCSCStyleName[m2] <- NA_character_
    }

    ## 2. We assign based on exact match between 'UCSC_seqlevels'
    ##    and 'NCBI_UCSCStyleName'.
    m <- match(UCSC_seqlevels[unmapped_idx], NCBI_UCSCStyleName)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 3. We assign based on exact match (case insensitive) between UCSC
    ##    and NCBI seqlevels.
    ucsc_seqlevels <- tolower(UCSC_seqlevels)
    ncbi_seqlevels <- tolower(NCBI_seqlevels)
    m <- match(ucsc_seqlevels[unmapped_idx], ncbi_seqlevels)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 4. We assign based on exact match (case insensitive) between UCSC
    ##    and NCBI seqlevels after removal of the "chr" prefix on both sides.
    nochr_ucsc_seqlevels <- sub("^chr", "", ucsc_seqlevels[unmapped_idx])
    nochr_ncbi_seqlevels <- sub("^chr", "", ncbi_seqlevels)
    m <- match(nochr_ucsc_seqlevels, nochr_ncbi_seqlevels)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 5. We assign based on match between UCSC seqlevels and RefSeq
    ##    accession numbers.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_RefSeqAccn)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 6. We assign based on match between UCSC seqlevels and GenBank
    ##    accession numbers.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_GenBankAccn)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 7. We assign based on match between UCSC seqlevels and GenBank
    ##    accession numbers after adding ".1" suffix to UCSC seqlevels.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_GenBankAccn,
                                             accn.suffix=".1")
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 8. We assign based on match between UCSC seqlevels and RefSeq
    ##    accession numbers after trimming the first part (e.g. "chr1_")
    ##    and "_random" suffix from the seqlevels.
    m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
                                             NCBI_RefSeqAccn,
                                             trim.seqlevels=TRUE)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 9. We assign based on GenBank accession number inferred from part 2 of
    ##    UCSC seqlevels (after adding automatic suffix ".1" to the inferred
    ##    GenBank accession number if it's not versioned).
    m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
                                        UCSC_seqlevels[unmapped_idx],
                                        NCBI_GenBankAccn,
                                        part2.auto.version=".1")
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 10. We assign based on GenBank accession number inferred from part 2 of
    ##    UCSC seqlevels (after dropping the version from the GenBank accession
    ##    numbers provided by NCBI).
    NCBI_GenBankAccn0 <- sub("\\.[^.]*$", "", NCBI_GenBankAccn)
    m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
                                        UCSC_seqlevels[unmapped_idx],
                                        NCBI_GenBankAccn0)
    L2R[unmapped_idx] <- m

    unmapped_idx <- which(is.na(L2R))
    if (length(unmapped_idx) == 0L)
        return(L2R)

    ## 11. We assign based on GenBank accession number inferred from part 2 of
    ##    UCSC seqlevels after adding AAD prefix to it.
    m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
                                        UCSC_seqlevels[unmapped_idx],
                                        NCBI_GenBankAccn,
                                        part2.auto.version=".1",
                                        accn.prefix="AAD")
    L2R[unmapped_idx] <- m

    L2R
}

.add_NCBI_cols_to_UCSC_chrom_info <- function(
    UCSC_chrom_info, assembly_accession,
    AssemblyUnits=NULL, special_mappings=NULL, unmapped_seqs=NULL,
    drop_unmapped=FALSE)
{
    UCSC_seqlevels <- UCSC_chrom_info[ , "chrom"]

    if (length(unmapped_seqs) != 0L) {
        unmapped_seqs_role <- rep.int(names(unmapped_seqs),
                                      lengths(unmapped_seqs))
        unmapped_seqs <- unlist(unmapped_seqs, use.names=FALSE)
        unmapped_idx <- match(unmapped_seqs, UCSC_seqlevels)
        stopifnot(!anyNA(unmapped_idx))
    }

    NCBI_chrom_info <- getChromInfoFromNCBI(assembly_accession,
                                            assembly.units=AssemblyUnits)

    NCBI_seqlevels     <- NCBI_chrom_info[ , "SequenceName"]
    NCBI_UCSCStyleName <- NCBI_chrom_info[ , "UCSCStyleName"]
    NCBI_GenBankAccn   <- NCBI_chrom_info[ , "GenBankAccn"]
    NCBI_RefSeqAccn    <- NCBI_chrom_info[ , "RefSeqAccn"]
    L2R <- .map_UCSC_seqlevels_to_NCBI_seqlevels(
                                       UCSC_seqlevels,
                                       NCBI_seqlevels,
                                       NCBI_UCSCStyleName,
                                       NCBI_GenBankAccn,
                                       NCBI_RefSeqAccn,
                                       special_mappings=special_mappings)
    L2R_is_NA <- is.na(L2R)
    mapped_idx <- which(!L2R_is_NA)

    if (length(unmapped_seqs) != 0L)
        stopifnot(all(L2R_is_NA[unmapped_idx]))

    if (isTRUE(drop_unmapped)) {
        UCSC_chrom_info <-
            S4Vectors:::extract_data_frame_rows(UCSC_chrom_info, mapped_idx)
        L2R <- L2R[mapped_idx]
        mapped_idx <- seq_along(L2R)
        ## Before we drop the unmapped UCSC seqlevels, we want to make sure
        ## that all the NCBI seqlevels are reverse mapped. For example, in
        ## the case of hg38, the chromInfo table at UCSC contains sequences
        ## that belong to GRCh38.p12 but not to GRCh38. So we want to make
        ## sure that after we drop these "foreign sequences", we are left
        ## with a one-to-one mapping between UCSC seqlevels and the 455 NCBI
        ## seqlevels that are in the assembly report for GRCh38.
        idx <- setdiff(seq_along(NCBI_seqlevels), L2R)
        if (length(idx) != 0L) {
            in1string <- paste0(NCBI_seqlevels[idx], collapse=", ")
            stop(wmsg("no UCSC seqlevel could be mapped to the following ",
                      "NCBI seqlevel(s): ", in1string))
        }
    } else {
        unexpectedly_unmapped_idx <-
            which(L2R_is_NA & !(UCSC_seqlevels %in% unmapped_seqs))
        if (length(unexpectedly_unmapped_idx) != 0L) {
            in1string <- paste0(UCSC_seqlevels[unexpectedly_unmapped_idx],
                                collapse=", ")
            stop(wmsg("cannot map the following UCSC seqlevel(s) to an ",
                      "NCBI seqlevel: ", in1string))
        }
    }

    NCBI_chrom_info <- S4Vectors:::extract_data_frame_rows(NCBI_chrom_info, L2R)

    ## NCBI columns "circular", "SequenceLength", and "UCSCStyleName"
    ## are expected to be redundant with UCSC columns "circular", "size",
    ## and "chrom", respectively. Let's make sure these 3 NCBI columns
    ## agree with their UCSC counterpart before dropping them.

    stopifnot(identical(UCSC_chrom_info[mapped_idx, "circular"],
                        NCBI_chrom_info[mapped_idx, "circular"]))

    compare_idx <- which(!is.na(NCBI_chrom_info[ , "SequenceLength"]))
    stopifnot(identical(UCSC_chrom_info[compare_idx, "size"],
                        NCBI_chrom_info[compare_idx, "SequenceLength"]))

    ## According to the UCSC-style-name column of the assembly report for
    ## GRCh37.p13 (accession GCF_000001405.25), MT in GRCh37.p13 is mapped
    ## to chrM in hg19, which is wrong! So we skip this sanity check for
    ## GCF_000001405.25.
    if (assembly_accession != "GCF_000001405.25") {
        compare_idx <- which(!is.na(NCBI_chrom_info[ , "UCSCStyleName"]))
        stopifnot(identical(UCSC_chrom_info[compare_idx, "chrom"],
                            NCBI_chrom_info[compare_idx, "UCSCStyleName"]))
    }

    drop_columns <- c("SequenceLength", "UCSCStyleName", "circular")
    NCBI_chrom_info <- drop_cols(NCBI_chrom_info, drop_columns)
    colnames(NCBI_chrom_info) <- paste0("NCBI.", colnames(NCBI_chrom_info))
    ans <- cbind(UCSC_chrom_info, NCBI_chrom_info)

    if (length(unmapped_seqs) != 0L)
        ans[unmapped_idx, "NCBI.SequenceRole"] <- unmapped_seqs_role

    stopifnot(!is.unsorted(ans[ , "NCBI.SequenceRole"]))
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .add_ensembl_column()
###

.fetch_ucscToEnsembl_table <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    col2class <- c(ucsc="character", ensembl="character")
    fetch_table_from_UCSC(genome, "ucscToEnsembl",
                          col2class=col2class,
                          goldenPath.url=goldenPath.url)
}

### Filters and reformats the chromAlias data to so it's returned in the
### same format as the ucscToEnsembl table (i.e. 2-column data frame with
### columns "ucsc" and "ensembl").
.fetch_ucsc2ensembl_from_chromAlias_table <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    col2class <- c(ensembl="character", ucsc="character", source="factor")
    chromAlias <- fetch_table_from_UCSC(genome, "chromAlias",
                                        col2class=col2class,
                                        goldenPath.url=goldenPath.url)
    ## Filters and reformats.
    keep_idx <- grep("ensembl", chromAlias[ , "source"], fixed=TRUE)
    ucsc2ensembl <- chromAlias[keep_idx, c("ucsc", "ensembl")]

    ## In some **very rare** situations, it seems that the chromAlias table
    ## can contain ambiguous ucsc-to-ensembl mappings where a given UCSC
    ## chromosome name is mapped to more than one Ensembl name. For example
    ## this happens for rn6 where unplaced scaffold chrUn_AABR07022993v1 is
    ## mapped to Ensembl names AABR07022518.1, AABR07023006.1, AABR07022993.1.
    ## Note that as of Jan 2020, the chromAlias table for rn6 is the only
    ## known case of ambiguous ucsc-to-ensembl mapping.
    if (anyDuplicated(ucsc2ensembl[ , "ucsc"])) {
        ## The disambiguation algorithm below assumes that the ambiguous
        ## ucsc-to-ensembl mappings are of the form xxYYY-to-ZZZ where
        ## ZZZ is a GenBank accession, YYY a GenBank accession where the
        ## dot (".") has been replaced with a "v" and xx can be any suffix.
        ## To disambiguate, we only keep mappings where YYY and ZZZ are the
        ## same (modulo the dot/v substitution).
        idx <- which(duplicated(ucsc2ensembl[ , "ucsc"]) |
                     duplicated(ucsc2ensembl[ , "ucsc"], fromLast=TRUE))
        ensembl <- chartr(".", "v", ucsc2ensembl[idx , "ensembl"])
        ucsc <- ucsc2ensembl[idx , "ucsc"]
        ucsc_nc <- nchar(ucsc)
        ucsc_suffix <- substr(ucsc, ucsc_nc - nchar(ensembl) + 1L, ucsc_nc)
        ok <- ucsc_suffix == ensembl
        if (sum(ok) == 0L)
            stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
        if (!all(ucsc %in% ucsc[ok]))
            stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
        if (anyDuplicated(ucsc[ok]))
            stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
        drop_idx <- idx[!ok]
        ucsc2ensembl <- ucsc2ensembl[-drop_idx, , drop=FALSE]
    }
    ucsc2ensembl
}

.try_to_fetch_ucscToEnsembl <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    message("Trying 'ucscToEnsembl' ... ", appendLF=FALSE)
    ucscToEnsembl <- try(
        suppressWarnings(
            .fetch_ucscToEnsembl_table(genome, goldenPath.url=goldenPath.url)
        ), silent=TRUE)
    if (inherits(ucscToEnsembl, "try-error")) {
        message("FAILED! (table does not exist)")
        return(NULL)
    }
    message("OK")
    ucscToEnsembl
}

.try_to_fetch_ucsc2ensembl_from_chromAlias <- function(genome,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    message("Trying 'chromAlias' ... ", appendLF=FALSE)
    ucsc2ensembl <- try(
        suppressWarnings(
            .fetch_ucsc2ensembl_from_chromAlias_table(genome,
                                                goldenPath.url=goldenPath.url)
        ), silent=TRUE)
    if (inherits(ucsc2ensembl, "try-error")) {
        message("FAILED! (table does not exist)")
        return(NULL)
    }
    if (nrow(ucsc2ensembl) == 0L) {
        message("FAILED! (table exists but has no Ensembl information)")
        return(NULL)
    }
    message("OK")
    ucsc2ensembl
}

.fetch_Ensembl_column <- function(UCSC_chroms, genome, table=NA,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    if (identical(table, "ucscToEnsembl")) {
        ucsc2ensembl <- .fetch_ucscToEnsembl_table(genome,
                                                goldenPath.url=goldenPath.url)
    } else if (identical(table, "chromAlias")) {
        ucsc2ensembl <- .fetch_ucsc2ensembl_from_chromAlias_table(genome,
                                                goldenPath.url=goldenPath.url)
    } else if (identical(table, NA)) {
        table <- "ucscToEnsembl"
        ucsc2ensembl <- .try_to_fetch_ucscToEnsembl(genome,
                                      goldenPath.url=goldenPath.url)
        if (is.null(ucsc2ensembl)) {
            table <- "chromAlias"
            ucsc2ensembl <- .try_to_fetch_ucsc2ensembl_from_chromAlias(genome,
                                      goldenPath.url=goldenPath.url)
            if (is.null(ucsc2ensembl)) {
                warning(wmsg("No Ensembl sequence names could be ",
                             "found for UCSC genome ", genome))
                return(rep.int(NA_character_, length(UCSC_chroms)))
            }
        }
    } else {
        stop(wmsg("supplied 'table' is not supported"))
    }
    oo <- match(UCSC_chroms, ucsc2ensembl[ , "ucsc"])
    if (is.null(table) && anyNA(oo))
        warning(wmsg("incomplete data in '", table, "' table for ",
                     "UCSC genome ", genome, ": some UCSC sequence ",
                     "names are missing"))
    ucsc2ensembl[oo , "ensembl"]
}

.add_ensembl_column <- function(UCSC_chrom_info, genome, table=NA,
    goldenPath.url=getOption("UCSC.goldenPath.url"))
{
    ensembl <- .fetch_Ensembl_column(UCSC_chrom_info[ , "chrom"],
                                     genome, table=table,
                                     goldenPath.url=goldenPath.url)
    cbind(UCSC_chrom_info, ensembl=ensembl, stringsAsFactors=FALSE)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### registered_UCSC_genomes()
###

.parse_script_for_registered_UCSC_genome <- function(script_path)
{
    filename <- basename(script_path)
    if (substr(filename, nchar(filename)-1L, nchar(filename)) != ".R")
        stop(wmsg("name of genome registration file '", filename, "' must ",
                  "have extension .R"))
    if (grepl(" ", filename, fixed=TRUE))
        stop(wmsg("name of genome registration file '", filename, "' must ",
                  "not contain spaces"))

    ## Placeholders. Will actually get defined when we source the script.
    ## See README.TXT in inst/registered/UCSC_genomes/ for the list of
    ## variables.
    GENOME <- ORGANISM <- ASSEMBLED_MOLECULES <- CIRC_SEQS <- NULL
    GET_CHROM_SIZES <- NCBI_LINKER <- ENSEMBL_LINKER <- NULL
    source(script_path, local=TRUE)

    stop_if <- function(notok, ...) {
        if (notok)
            stop("Error in UCSC genome registration file '", filename,
                 "':\n  ", wmsg(...))
    }

    ## Check GENOME.
    stop_if(is.null(GENOME), "'GENOME' must be defined")
    stop_if(!isSingleString(GENOME), "'GENOME' must be a single string")
    target <- substr(filename, 1L, nchar(filename)-2L)
    stop_if(!identical(target, GENOME), "'GENOME' must match filename")

    ## Check ORGANISM.
    stop_if(is.null(ORGANISM), "'ORGANISM' must be defined")
    stop_if(!isSingleString(ORGANISM), "'ORGANISM' must be a single string")
    stop_if(grepl("_", ORGANISM, fixed=TRUE),
            "underscores are not allowed in 'ORGANISM' (use spaces instead)")

    ## Check ASSEMBLED_MOLECULES.
    stop_if(is.null(ASSEMBLED_MOLECULES),
            "'ASSEMBLED_MOLECULES' must be defined")
    stop_if(!is.character(ASSEMBLED_MOLECULES),
            "'ASSEMBLED_MOLECULES' must be a character vector")
    stop_if(!is_primary_key(ASSEMBLED_MOLECULES),
            "'ASSEMBLED_MOLECULES' must ",
            "not contain NAs, empty strings, or duplicates")

    ## Check CIRC_SEQS.
    stop_if(is.null(CIRC_SEQS),
            "'CIRC_SEQS' must be defined")
    stop_if(!is.character(CIRC_SEQS),
            "'CIRC_SEQS' must be a character vector")
    stop_if(anyDuplicated(CIRC_SEQS),
            "'CIRC_SEQS' must not contain duplicates")
    stop_if(!all(CIRC_SEQS %in% ASSEMBLED_MOLECULES),
            "'CIRC_SEQS' must be a subset of 'ASSEMBLED_MOLECULES'")

    ## Check GET_CHROM_SIZES.
    if (!is.null(GET_CHROM_SIZES)) {
        stop_if(!is.function(GET_CHROM_SIZES),
                "when defined, 'GET_CHROM_SIZES' must be a function")
    }

    ## Check NCBI_LINKER.
    if (!is.null(NCBI_LINKER)) {
        stop_if(!is.list(NCBI_LINKER),
                "when defined, 'NCBI_LINKER' must be a named list")
        linker_fields <- names(NCBI_LINKER)
        stop_if(!is.character(linker_fields),
                "when defined, 'NCBI_LINKER' must be a named list")
        stop_if(!is_primary_key(linker_fields),
                "the names on 'NCBI_LINKER' must ",
                "not contain NAs, empty strings, or duplicates")
        stop_if(!("assembly_accession" %in% linker_fields),
                "'NCBI_LINKER' must have field \"assembly_accession\"")
        accession <- NCBI_LINKER$assembly_accession
        stop_if(!isSingleString(accession) || accession == "",
                "\"assembly_accession\" field in 'NCBI_LINKER' must ",
                "be a single non-empty string")
        NCBI_assembly_info <- find_NCBI_assembly_info_for_accession(accession)
        stop_if(is.null(NCBI_assembly_info),
                "\"assembly_accession\" field in 'NCBI_LINKER' must ",
                "be associated with a registered NCBI assembly")
        stop_if(!identical(ORGANISM, NCBI_assembly_info$organism),
                "the NCBI assembly associated with the \"assembly_accession\" ",
                "field in 'NCBI_LINKER' is registered for an organism ",
                "(\"", NCBI_assembly_info$organism, "\") that differs ",
                "from 'ORGANISM' (\"", ORGANISM, "\")")
    }

    ## Check ENSEMBL_LINKER.
    if (!is.null(ENSEMBL_LINKER)) {
        stop_if(!isSingleString(ENSEMBL_LINKER) || ENSEMBL_LINKER == "",
                "when defined, 'ENSEMBL_LINKER' must ",
                "be a single non-empty string")
    }

    list(GENOME=GENOME,
         ORGANISM=ORGANISM,
         ASSEMBLED_MOLECULES=ASSEMBLED_MOLECULES,
         CIRC_SEQS=CIRC_SEQS,
         GET_CHROM_SIZES=GET_CHROM_SIZES,
         NCBI_LINKER=NCBI_LINKER,
         ENSEMBL_LINKER=ENSEMBL_LINKER)
}

registered_UCSC_genomes <- function()
{
    dir_path <- system.file("registered", "UCSC_genomes",
                             package="GenomeInfoDb")
    file_paths <- list.files(dir_path, pattern="\\.R$", full.names=TRUE)
    assemblies <- lapply(file_paths, .parse_script_for_registered_UCSC_genome)

    colnames <- c("organism", "genome", "NCBI_assembly",
                  "assembly_accession", "with_Ensembl", "circ_seqs")
    make_col <- function(j) {
        colname <- colnames[[j]]
        if (colname == "NCBI_assembly") {
            col <- vapply(assemblies,
                function(assembly_info) {
                    linker <- assembly_info$NCBI_LINKER
                    if (is.null(linker))
                        return(NA_character_)
                    accession <- linker$assembly_accession
                    find_NCBI_assembly_info_for_accession(accession)$assembly
                }, character(1), USE.NAMES=FALSE)
            return(col)
        }
        if (colname == "assembly_accession") {
            col <- vapply(assemblies,
                function(assembly_info) {
                    linker <- assembly_info$NCBI_LINKER
                    if (is.null(linker))
                        return(NA_character_)
                    linker$assembly_accession
                }, character(1), USE.NAMES=FALSE)
            return(col)
        }
        if (colname == "with_Ensembl") {
            col <- vapply(assemblies,
                function(assembly_info) {
                    linker <- assembly_info$ENSEMBL_LINKER
                    !is.null(linker)
                }, logical(1), USE.NAMES=FALSE)
            return(col)
        }
        COLNAME <- toupper(colname)
        col0 <- lapply(assemblies, `[[`, COLNAME)
        if (colname == "circ_seqs")
            return(CharacterList(col0))
        stopifnot(all(lengths(col0) == 1L))
        col <- as.character(unlist(col0, use.names=FALSE))
        if (colname == "organism")
            col <- factor(col)  # order of levels will dictate order
                                # of rows in final DataFrame
        col
    }

    listData <- lapply(setNames(seq_along(colnames), colnames), make_col)
    ans <- S4Vectors:::new_DataFrame(listData, nrows=length(assemblies))
    genome_trailing_digits <- sub("(.*[^0-9])([0-9]*)$", "\\2", ans$genome)
    oo <- order(ans$organism, as.integer(genome_trailing_digits))
    as.data.frame(ans[oo, , drop=FALSE])
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### getChromInfoFromUCSC()
###

.UCSC_cached_chrom_info <- new.env(parent=emptyenv())

.get_chrom_info_for_unregistered_UCSC_genome <- function(genome,
    assembled.molecules.only=FALSE,
    add.ensembl.col=FALSE,
    goldenPath.url=getOption("UCSC.goldenPath.url"),
    recache=FALSE)
{
    ans <- .UCSC_cached_chrom_info[[genome]]
    if (is.null(ans) || recache) {
        ans <- try(suppressWarnings(
                       fetch_chrom_sizes_from_UCSC(genome,
                           goldenPath.url=goldenPath.url)),
                   silent=TRUE)
        if (inherits(ans, "try-error"))
            stop(wmsg("unknown UCSC genome: ", genome))
        oo <- orderSeqlevels(ans[ , "chrom"])
        ans <- S4Vectors:::extract_data_frame_rows(ans, oo)

        ## Add columns "assembled" and "circular".
        assembled <- rep.int(NA, nrow(ans))
        circular <- make_circ_flags_from_circ_seqs(ans[ , "chrom"])
        ans <- cbind(ans, assembled=assembled, circular=circular)

        .UCSC_cached_chrom_info[[genome]] <- ans
    }
    if (isTRUE(add.ensembl.col)) {
        warning(wmsg("'add.ensembl.col' got ignored for unregistered ",
                     "UCSC genome ", genome, "."),
                     "\n  ",
                wmsg("Use 'add.ensembl.col=\"force\"' to try adding ",
                     "the \"ensembl\" column anyway."))
    } else if (identical(add.ensembl.col, "force")) {
        ans <- .add_ensembl_column(ans, genome,
                                   goldenPath.url=goldenPath.url)
    }
    if (assembled.molecules.only)
        warning(wmsg("'assembled.molecules' got ignored for unregistered ",
                     "UCSC genome ", genome, " (don't know what the ",
                     "assembled molecules are for an unregistered UCSC ",
                     "genome)"))
    ans
}

.get_NCBI_linker <- function(map.NCBI, linker, GENOME)
{
    if (is.list(map.NCBI))
        return(map.NCBI)
    if (isFALSE(map.NCBI))
        return(NULL)
    if (is.null(linker))
        warning(wmsg("'map.NCBI' got ignored for registered ",
                     "UCSC genome ", GENOME, " (additional NCBI columns ",
                     "are only available for registered UCSC genomes ",
                     "based on an NCBI assembly)"))
    linker
}

.get_Ensembl_linker <- function(add.ensembl.col, linker, GENOME)
{
    if (identical(add.ensembl.col, "force"))
        return(NA)
    if (isFALSE(add.ensembl.col))
        return(NULL)
    if (is.null(linker))
        warning(wmsg("'add.ensembl.col' got ignored for registered ",
                     "UCSC genome ", GENOME, " (genome was registered ",
                     "with no Ensembl sequence names linked to it)."),
                     "\n  ",
                wmsg("Use 'add.ensembl.col=\"force\"' to try adding ",
                     "the \"ensembl\" column anyway."))
    linker
}

.get_chrom_info_for_registered_UCSC_genome <- function(script_path,
    assembled.molecules.only=FALSE,
    map.NCBI=FALSE,
    add.ensembl.col=FALSE,
    goldenPath.url=getOption("UCSC.goldenPath.url"),
    recache=FALSE)
{
    vars <- .parse_script_for_registered_UCSC_genome(script_path)
    GENOME <- vars$GENOME
    ASSEMBLED_MOLECULES <- vars$ASSEMBLED_MOLECULES

    nb_assembled <- length(ASSEMBLED_MOLECULES)
    assembled_idx <- seq_len(nb_assembled)

    ans <- .UCSC_cached_chrom_info[[GENOME]]
    if (is.null(ans) || recache) {
        GET_CHROM_SIZES <- vars$GET_CHROM_SIZES
        if (is.null(GET_CHROM_SIZES)) {
            ans <- fetch_chrom_sizes_from_UCSC(GENOME,
                                               goldenPath.url=goldenPath.url)
            stopifnot(nrow(ans) == nb_assembled)
            oo <- match(ASSEMBLED_MOLECULES, ans[ , "chrom"])
            stopifnot(!anyNA(oo))
            ans <- S4Vectors:::extract_data_frame_rows(ans, oo)
        } else {
            ans <- GET_CHROM_SIZES(goldenPath.url=goldenPath.url)
            stopifnot(is.data.frame(ans),
                      identical(sapply(ans, class),
                                c(chrom="character", size="integer")),
                      identical(head(ans[ , "chrom"], n=nb_assembled),
                                ASSEMBLED_MOLECULES),
                      is_primary_key(ans[ , "chrom"]))
        }

        ## Add columns "assembled" and "circular".
        assembled <- logical(nrow(ans))
        assembled[assembled_idx] <- TRUE
        circular <- make_circ_flags_from_circ_seqs(ans[ , "chrom"],
                                                   vars$CIRC_SEQS)
        ans <- cbind(ans, assembled=assembled, circular=circular)

        .UCSC_cached_chrom_info[[GENOME]] <- ans
    }

    ## Add NCBI cols.
    NCBI_linker <- .get_NCBI_linker(map.NCBI, vars$NCBI_LINKER, GENOME)
    if (!is.null(NCBI_linker))
        ans <- do.call(.add_NCBI_cols_to_UCSC_chrom_info,
                       c(list(ans), NCBI_linker))

    ## Add Ensembl col.
    Ensemb_linker <- .get_Ensembl_linker(add.ensembl.col, vars$ENSEMBL_LINKER,
                                         GENOME)
    if (!is.null(Ensemb_linker))
        ans <- .add_ensembl_column(ans, GENOME, table=Ensemb_linker,
                                   goldenPath.url=goldenPath.url)

    if (assembled.molecules.only)
        ans <- S4Vectors:::extract_data_frame_rows(ans, assembled_idx)

    if (!is.null(NCBI_linker)) {
        assembly_accession <- NCBI_linker$assembly_accession
        NCBI_assembly_info <-
            find_NCBI_assembly_info_for_accession(assembly_accession)
        attr(ans, "NCBI_assembly_info") <- NCBI_assembly_info
    }
    ans
}

### Return a 4-column data.frame with columns "chrom" (character), "size"
### (integer), "assembled" (logical), and "circular" (logical).
getChromInfoFromUCSC <- function(genome,
    assembled.molecules.only=FALSE,
    map.NCBI=FALSE,
    add.ensembl.col=FALSE,
    goldenPath.url=getOption("UCSC.goldenPath.url"),
    recache=FALSE,
    as.Seqinfo=FALSE)
{
    if (!isSingleString(genome))
        stop(wmsg("'genome' must be a single string"))
    if (!isTRUEorFALSE(assembled.molecules.only))
        stop(wmsg("'assembled.molecules.only' must be TRUE or FALSE"))
    if (!(isTRUEorFALSE(map.NCBI) ||
          is.list(map.NCBI) && !is.null(names(map.NCBI))))
        stop(wmsg("'map.NCBI' must be TRUE or FALSE ",
                  "(experts can also supply an \"NCBI linker\" ",
                  "as a named list)"))
    if (!(isTRUEorFALSE(add.ensembl.col) ||
          identical(add.ensembl.col, "force")))
        stop(wmsg("'add.ensembl.col' must be TRUE or FALSE ",
                  "(experts can also use \"force\")"))
    if (!isSingleString(goldenPath.url))
        stop(wmsg("'goldenPath.url' must be a single string"))
    if (!isTRUEorFALSE(recache))
        stop(wmsg("'recache' must be TRUE or FALSE"))
    if (!isTRUEorFALSE(as.Seqinfo))
        stop(wmsg("'as.Seqinfo' must be TRUE or FALSE"))

    script_name <- paste0(genome, ".R")
    script_path <- system.file("registered", "UCSC_genomes", script_name,
                               package="GenomeInfoDb")

    if (identical(script_path, "")) {
        if (!isFALSE(map.NCBI))
            warning(wmsg("'map.NCBI' got ignored for unregistered ",
                         "UCSC genome ", genome, " (additional NCBI columns ",
                         "are only available for registered UCSC genomes ",
                         "based on an NCBI assembly)"))
        ans <- .get_chrom_info_for_unregistered_UCSC_genome(genome,
                    assembled.molecules.only=assembled.molecules.only,
                    add.ensembl.col=add.ensembl.col,
                    goldenPath.url=goldenPath.url,
                    recache=recache)
    } else {
        ans <- .get_chrom_info_for_registered_UCSC_genome(script_path,
                    assembled.molecules.only=assembled.molecules.only,
                    map.NCBI=map.NCBI,
                    add.ensembl.col=add.ensembl.col,
                    goldenPath.url=goldenPath.url,
                    recache=recache)
    }

    if (!as.Seqinfo)
        return(ans)
    Seqinfo(seqnames=ans[ , "chrom"],
            seqlengths=ans[ , "size"],
            isCircular=ans[ , "circular"],
            genome=genome)
}

Try the GenomeInfoDb package in your browser

Any scripts or data that you put into this service are public.

GenomeInfoDb documentation built on April 9, 2021, 6 p.m.