Nothing
### =========================================================================
### getChromInfoFromUCSC()
### -------------------------------------------------------------------------
###
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .add_NCBI_cols_to_UCSC_chrom_info()
###
.match_UCSC_seqlevels_to_NCBI_accns <- function(UCSC_seqlevels, NCBI_accns,
trim.seqlevels=FALSE,
accn.suffix="")
{
if (trim.seqlevels) {
UCSC_seqlevels <- sub("_(alt|random|fix)$", "", UCSC_seqlevels)
UCSC_seqlevels <- sub("^[^_]*_", "", UCSC_seqlevels)
}
x <- chartr("v-", "..", UCSC_seqlevels)
x <- paste0(x, accn.suffix)
names(x) <- UCSC_seqlevels
solid_match2(tolower(x), tolower(NCBI_accns),
x_what="UCSC seqlevel", table_what="accession number")
}
.match_UCSC_seqlevels_part2_to_NCBI_accns <-
function(UCSC_seqlevels, NCBI_accns, part2.auto.version="", accn.prefix="")
{
ans <- rep.int(NA_integer_, length(UCSC_seqlevels))
seqlevel_parts <- strsplit(UCSC_seqlevels, "_")
nparts <- lengths(seqlevel_parts)
idx2 <- which(nparts >= 2L)
if (length(idx2) == 0L)
return(ans)
offsets <- c(0L, cumsum(nparts[idx2[-length(idx2)]]))
x <- unlist(seqlevel_parts[idx2], use.names=FALSE)[offsets + 2L]
x <- chartr("v", ".", x)
unversioned_idx <- grep(".", x, fixed=TRUE, invert=TRUE)
if (length(unversioned_idx) != 0L)
x[unversioned_idx] <- paste0(x[unversioned_idx], part2.auto.version)
x <- paste0(accn.prefix, x)
names(x) <- UCSC_seqlevels[idx2]
ans[idx2] <- solid_match2(tolower(x), tolower(NCBI_accns),
x_what="UCSC seqlevel",
table_what="accession number")
ans
}
### The workhorse behind .add_NCBI_cols_to_UCSC_chrom_info().
### - All input vectors must be character vectors.
### - All NCBI input vectors must have the same length.
### - Vectors 'UCSC_seqlevels' and 'NCBI_seqlevels' must be "primary
### keys" i.e. must not contain NAs, empty strings, or duplicates.
### (No such assumption is made about the other input vectors.)
### Returns an integer vector parallel to 'UCSC_seqlevels'.
.map_UCSC_seqlevels_to_NCBI_seqlevels <- function(UCSC_seqlevels,
NCBI_seqlevels,
NCBI_UCSCStyleName,
NCBI_GenBankAccn,
NCBI_RefSeqAccn,
special_mappings=NULL)
{
stopifnot(is.character(UCSC_seqlevels),
is_primary_key(UCSC_seqlevels),
is.character(NCBI_seqlevels),
is_primary_key(NCBI_seqlevels),
is.character(NCBI_UCSCStyleName),
is.character(NCBI_GenBankAccn),
is.character(NCBI_RefSeqAccn))
L2R <- rep.int(NA_integer_, length(UCSC_seqlevels))
unmapped_idx <- seq_along(L2R)
## 1. Handle special mappings.
if (!is.null(special_mappings)) {
m1 <- match(names(special_mappings), UCSC_seqlevels)
if (anyNA(m1))
stop(wmsg("'names(special_mappings)' contains sequence ",
"names not found in the UCSC genome"))
m2 <- match(special_mappings, NCBI_seqlevels)
if (anyNA(m2))
stop(wmsg("'special_mappings' contains sequence ",
"names not found in the NCBI assembly"))
L2R[m1] <- m2
unmapped_idx <- which(is.na(L2R))
if (length(unmapped_idx) == 0L)
return(L2R)
NCBI_UCSCStyleName[m2] <- NA_character_
}
## 2. We assign based on exact match between 'UCSC_seqlevels'
## and 'NCBI_UCSCStyleName'.
m <- match(UCSC_seqlevels[unmapped_idx], NCBI_UCSCStyleName)
L2R[unmapped_idx] <- m
unmapped_idx <- which(is.na(L2R))
if (length(unmapped_idx) == 0L)
return(L2R)
## 3. We assign based on exact match (case insensitive) between UCSC
## and NCBI seqlevels.
ucsc_seqlevels <- tolower(UCSC_seqlevels)
ncbi_seqlevels <- tolower(NCBI_seqlevels)
m <- match(ucsc_seqlevels[unmapped_idx], ncbi_seqlevels)
L2R[unmapped_idx] <- m
unmapped_idx <- which(is.na(L2R))
if (length(unmapped_idx) == 0L)
return(L2R)
## 4. We assign based on exact match (case insensitive) between UCSC
## and NCBI seqlevels after removal of the "chr" prefix on both sides.
nochr_ucsc_seqlevels <- sub("^chr", "", ucsc_seqlevels[unmapped_idx])
nochr_ncbi_seqlevels <- sub("^chr", "", ncbi_seqlevels)
m <- match(nochr_ucsc_seqlevels, nochr_ncbi_seqlevels)
L2R[unmapped_idx] <- m
unmapped_idx <- which(is.na(L2R))
if (length(unmapped_idx) == 0L)
return(L2R)
## 5. We assign based on match between UCSC seqlevels and RefSeq
## accession numbers.
m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
NCBI_RefSeqAccn)
L2R[unmapped_idx] <- m
unmapped_idx <- which(is.na(L2R))
if (length(unmapped_idx) == 0L)
return(L2R)
## 6. We assign based on match between UCSC seqlevels and GenBank
## accession numbers.
m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
NCBI_GenBankAccn)
L2R[unmapped_idx] <- m
unmapped_idx <- which(is.na(L2R))
if (length(unmapped_idx) == 0L)
return(L2R)
## 7. We assign based on match between UCSC seqlevels and GenBank
## accession numbers after adding ".1" suffix to UCSC seqlevels.
m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
NCBI_GenBankAccn,
accn.suffix=".1")
L2R[unmapped_idx] <- m
unmapped_idx <- which(is.na(L2R))
if (length(unmapped_idx) == 0L)
return(L2R)
## 8. We assign based on match between UCSC seqlevels and RefSeq
## accession numbers after trimming the first part (e.g. "chr1_")
## and "_random" suffix from the seqlevels.
m <- .match_UCSC_seqlevels_to_NCBI_accns(UCSC_seqlevels[unmapped_idx],
NCBI_RefSeqAccn,
trim.seqlevels=TRUE)
L2R[unmapped_idx] <- m
unmapped_idx <- which(is.na(L2R))
if (length(unmapped_idx) == 0L)
return(L2R)
## 9. We assign based on GenBank accession number inferred from part 2 of
## UCSC seqlevels (after adding automatic suffix ".1" to the inferred
## GenBank accession number if it's not versioned).
m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
UCSC_seqlevels[unmapped_idx],
NCBI_GenBankAccn,
part2.auto.version=".1")
L2R[unmapped_idx] <- m
unmapped_idx <- which(is.na(L2R))
if (length(unmapped_idx) == 0L)
return(L2R)
## 10. We assign based on GenBank accession number inferred from part 2 of
## UCSC seqlevels (after dropping the version from the GenBank accession
## numbers provided by NCBI).
NCBI_GenBankAccn0 <- sub("\\.[^.]*$", "", NCBI_GenBankAccn)
m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
UCSC_seqlevels[unmapped_idx],
NCBI_GenBankAccn0)
L2R[unmapped_idx] <- m
unmapped_idx <- which(is.na(L2R))
if (length(unmapped_idx) == 0L)
return(L2R)
## 11. We assign based on GenBank accession number inferred from part 2 of
## UCSC seqlevels after adding AAD prefix to it.
m <- .match_UCSC_seqlevels_part2_to_NCBI_accns(
UCSC_seqlevels[unmapped_idx],
NCBI_GenBankAccn,
part2.auto.version=".1",
accn.prefix="AAD")
L2R[unmapped_idx] <- m
L2R
}
.add_NCBI_cols_to_UCSC_chrom_info <- function(
UCSC_chrom_info, assembly_accession,
AssemblyUnits=NULL, special_mappings=NULL, unmapped_seqs=NULL,
drop_unmapped=FALSE)
{
UCSC_seqlevels <- UCSC_chrom_info[ , "chrom"]
if (length(unmapped_seqs) != 0L) {
unmapped_seqs_role <- rep.int(names(unmapped_seqs),
lengths(unmapped_seqs))
unmapped_seqs <- unlist(unmapped_seqs, use.names=FALSE)
unmapped_idx <- match(unmapped_seqs, UCSC_seqlevels)
stopifnot(!anyNA(unmapped_idx))
}
NCBI_chrom_info <- getChromInfoFromNCBI(assembly_accession,
assembly.units=AssemblyUnits)
NCBI_seqlevels <- NCBI_chrom_info[ , "SequenceName"]
NCBI_UCSCStyleName <- NCBI_chrom_info[ , "UCSCStyleName"]
NCBI_GenBankAccn <- NCBI_chrom_info[ , "GenBankAccn"]
NCBI_RefSeqAccn <- NCBI_chrom_info[ , "RefSeqAccn"]
L2R <- .map_UCSC_seqlevels_to_NCBI_seqlevels(
UCSC_seqlevels,
NCBI_seqlevels,
NCBI_UCSCStyleName,
NCBI_GenBankAccn,
NCBI_RefSeqAccn,
special_mappings=special_mappings)
L2R_is_NA <- is.na(L2R)
mapped_idx <- which(!L2R_is_NA)
if (length(unmapped_seqs) != 0L)
stopifnot(all(L2R_is_NA[unmapped_idx]))
if (isTRUE(drop_unmapped)) {
UCSC_chrom_info <-
S4Vectors:::extract_data_frame_rows(UCSC_chrom_info, mapped_idx)
L2R <- L2R[mapped_idx]
mapped_idx <- seq_along(L2R)
## Before we drop the unmapped UCSC seqlevels, we want to make sure
## that all the NCBI seqlevels are reverse mapped. For example, in
## the case of hg38, the chromInfo table at UCSC contains sequences
## that belong to GRCh38.p12 but not to GRCh38. So we want to make
## sure that after we drop these "foreign sequences", we are left
## with a one-to-one mapping between UCSC seqlevels and the 455 NCBI
## seqlevels that are in the assembly report for GRCh38.
idx <- setdiff(seq_along(NCBI_seqlevels), L2R)
if (length(idx) != 0L) {
in1string <- paste0(NCBI_seqlevels[idx], collapse=", ")
stop(wmsg("no UCSC seqlevel could be mapped to the following ",
"NCBI seqlevel(s): ", in1string))
}
} else {
unexpectedly_unmapped_idx <-
which(L2R_is_NA & !(UCSC_seqlevels %in% unmapped_seqs))
if (length(unexpectedly_unmapped_idx) != 0L) {
in1string <- paste0(UCSC_seqlevels[unexpectedly_unmapped_idx],
collapse=", ")
stop(wmsg("cannot map the following UCSC seqlevel(s) to an ",
"NCBI seqlevel: ", in1string))
}
}
NCBI_chrom_info <- S4Vectors:::extract_data_frame_rows(NCBI_chrom_info, L2R)
## NCBI columns "circular", "SequenceLength", and "UCSCStyleName"
## are expected to be redundant with UCSC columns "circular", "size",
## and "chrom", respectively. Let's make sure these 3 NCBI columns
## agree with their UCSC counterpart before dropping them.
stopifnot(identical(UCSC_chrom_info[mapped_idx, "circular"],
NCBI_chrom_info[mapped_idx, "circular"]))
compare_idx <- which(!is.na(NCBI_chrom_info[ , "SequenceLength"]))
stopifnot(identical(UCSC_chrom_info[compare_idx, "size"],
NCBI_chrom_info[compare_idx, "SequenceLength"]))
## According to the UCSC-style-name column of the assembly report for
## GRCh37.p13 (accession GCF_000001405.25), MT in GRCh37.p13 is mapped
## to chrM in hg19, which is wrong! So we skip this sanity check for
## GCF_000001405.25.
if (assembly_accession != "GCF_000001405.25") {
compare_idx <- which(!is.na(NCBI_chrom_info[ , "UCSCStyleName"]))
stopifnot(identical(UCSC_chrom_info[compare_idx, "chrom"],
NCBI_chrom_info[compare_idx, "UCSCStyleName"]))
}
drop_columns <- c("SequenceLength", "UCSCStyleName", "circular")
NCBI_chrom_info <- drop_cols(NCBI_chrom_info, drop_columns)
colnames(NCBI_chrom_info) <- paste0("NCBI.", colnames(NCBI_chrom_info))
ans <- cbind(UCSC_chrom_info, NCBI_chrom_info)
if (length(unmapped_seqs) != 0L)
ans[unmapped_idx, "NCBI.SequenceRole"] <- unmapped_seqs_role
stopifnot(!is.unsorted(ans[ , "NCBI.SequenceRole"]))
ans
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .add_ensembl_column()
###
.fetch_ucscToEnsembl_table <- function(genome,
goldenPath.url=getOption("UCSC.goldenPath.url"))
{
col2class <- c(ucsc="character", ensembl="character")
fetch_table_from_UCSC(genome, "ucscToEnsembl",
col2class=col2class,
goldenPath.url=goldenPath.url)
}
### Filters and reformats the chromAlias data to so it's returned in the
### same format as the ucscToEnsembl table (i.e. 2-column data frame with
### columns "ucsc" and "ensembl").
.fetch_ucsc2ensembl_from_chromAlias_table <- function(genome,
goldenPath.url=getOption("UCSC.goldenPath.url"))
{
col2class <- c(ensembl="character", ucsc="character", source="factor")
chromAlias <- fetch_table_from_UCSC(genome, "chromAlias",
col2class=col2class,
goldenPath.url=goldenPath.url)
## Filters and reformats.
keep_idx <- grep("ensembl", chromAlias[ , "source"], fixed=TRUE)
ucsc2ensembl <- chromAlias[keep_idx, c("ucsc", "ensembl")]
## In some **very rare** situations, it seems that the chromAlias table
## can contain ambiguous ucsc-to-ensembl mappings where a given UCSC
## chromosome name is mapped to more than one Ensembl name. For example
## this happens for rn6 where unplaced scaffold chrUn_AABR07022993v1 is
## mapped to Ensembl names AABR07022518.1, AABR07023006.1, AABR07022993.1.
## Note that as of Jan 2020, the chromAlias table for rn6 is the only
## known case of ambiguous ucsc-to-ensembl mapping.
if (anyDuplicated(ucsc2ensembl[ , "ucsc"])) {
## The disambiguation algorithm below assumes that the ambiguous
## ucsc-to-ensembl mappings are of the form xxYYY-to-ZZZ where
## ZZZ is a GenBank accession, YYY a GenBank accession where the
## dot (".") has been replaced with a "v" and xx can be any suffix.
## To disambiguate, we only keep mappings where YYY and ZZZ are the
## same (modulo the dot/v substitution).
idx <- which(duplicated(ucsc2ensembl[ , "ucsc"]) |
duplicated(ucsc2ensembl[ , "ucsc"], fromLast=TRUE))
ensembl <- chartr(".", "v", ucsc2ensembl[idx , "ensembl"])
ucsc <- ucsc2ensembl[idx , "ucsc"]
ucsc_nc <- nchar(ucsc)
ucsc_suffix <- substr(ucsc, ucsc_nc - nchar(ensembl) + 1L, ucsc_nc)
ok <- ucsc_suffix == ensembl
if (sum(ok) == 0L)
stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
if (!all(ucsc %in% ucsc[ok]))
stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
if (anyDuplicated(ucsc[ok]))
stop(wmsg("failed to disambiguate 'ucsc2ensembl'"))
drop_idx <- idx[!ok]
ucsc2ensembl <- ucsc2ensembl[-drop_idx, , drop=FALSE]
}
ucsc2ensembl
}
.try_to_fetch_ucscToEnsembl <- function(genome,
goldenPath.url=getOption("UCSC.goldenPath.url"))
{
message("Trying 'ucscToEnsembl' ... ", appendLF=FALSE)
ucscToEnsembl <- try(
suppressWarnings(
.fetch_ucscToEnsembl_table(genome, goldenPath.url=goldenPath.url)
), silent=TRUE)
if (inherits(ucscToEnsembl, "try-error")) {
message("FAILED! (table does not exist)")
return(NULL)
}
message("OK")
ucscToEnsembl
}
.try_to_fetch_ucsc2ensembl_from_chromAlias <- function(genome,
goldenPath.url=getOption("UCSC.goldenPath.url"))
{
message("Trying 'chromAlias' ... ", appendLF=FALSE)
ucsc2ensembl <- try(
suppressWarnings(
.fetch_ucsc2ensembl_from_chromAlias_table(genome,
goldenPath.url=goldenPath.url)
), silent=TRUE)
if (inherits(ucsc2ensembl, "try-error")) {
message("FAILED! (table does not exist)")
return(NULL)
}
if (nrow(ucsc2ensembl) == 0L) {
message("FAILED! (table exists but has no Ensembl information)")
return(NULL)
}
message("OK")
ucsc2ensembl
}
.fetch_Ensembl_column <- function(UCSC_chroms, genome, table=NA,
goldenPath.url=getOption("UCSC.goldenPath.url"))
{
if (identical(table, "ucscToEnsembl")) {
ucsc2ensembl <- .fetch_ucscToEnsembl_table(genome,
goldenPath.url=goldenPath.url)
} else if (identical(table, "chromAlias")) {
ucsc2ensembl <- .fetch_ucsc2ensembl_from_chromAlias_table(genome,
goldenPath.url=goldenPath.url)
} else if (identical(table, NA)) {
table <- "ucscToEnsembl"
ucsc2ensembl <- .try_to_fetch_ucscToEnsembl(genome,
goldenPath.url=goldenPath.url)
if (is.null(ucsc2ensembl)) {
table <- "chromAlias"
ucsc2ensembl <- .try_to_fetch_ucsc2ensembl_from_chromAlias(genome,
goldenPath.url=goldenPath.url)
if (is.null(ucsc2ensembl)) {
warning(wmsg("No Ensembl sequence names could be ",
"found for UCSC genome ", genome))
return(rep.int(NA_character_, length(UCSC_chroms)))
}
}
} else {
stop(wmsg("supplied 'table' is not supported"))
}
oo <- match(UCSC_chroms, ucsc2ensembl[ , "ucsc"])
if (is.null(table) && anyNA(oo))
warning(wmsg("incomplete data in '", table, "' table for ",
"UCSC genome ", genome, ": some UCSC sequence ",
"names are missing"))
ucsc2ensembl[oo , "ensembl"]
}
.add_ensembl_column <- function(UCSC_chrom_info, genome, table=NA,
goldenPath.url=getOption("UCSC.goldenPath.url"))
{
ensembl <- .fetch_Ensembl_column(UCSC_chrom_info[ , "chrom"],
genome, table=table,
goldenPath.url=goldenPath.url)
cbind(UCSC_chrom_info, ensembl=ensembl, stringsAsFactors=FALSE)
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### registered_UCSC_genomes()
###
.parse_script_for_registered_UCSC_genome <- function(script_path)
{
filename <- basename(script_path)
if (substr(filename, nchar(filename)-1L, nchar(filename)) != ".R")
stop(wmsg("name of genome registration file '", filename, "' must ",
"have extension .R"))
if (grepl(" ", filename, fixed=TRUE))
stop(wmsg("name of genome registration file '", filename, "' must ",
"not contain spaces"))
## Placeholders. Will actually get defined when we source the script.
## See README.TXT in inst/registered/UCSC_genomes/ for the list of
## variables.
GENOME <- ORGANISM <- ASSEMBLED_MOLECULES <- CIRC_SEQS <- NULL
GET_CHROM_SIZES <- NCBI_LINKER <- ENSEMBL_LINKER <- NULL
source(script_path, local=TRUE)
stop_if <- function(notok, ...) {
if (notok)
stop("Error in UCSC genome registration file '", filename,
"':\n ", wmsg(...))
}
## Check GENOME.
stop_if(is.null(GENOME), "'GENOME' must be defined")
stop_if(!isSingleString(GENOME), "'GENOME' must be a single string")
target <- substr(filename, 1L, nchar(filename)-2L)
stop_if(!identical(target, GENOME), "'GENOME' must match filename")
## Check ORGANISM.
stop_if(is.null(ORGANISM), "'ORGANISM' must be defined")
stop_if(!isSingleString(ORGANISM), "'ORGANISM' must be a single string")
stop_if(grepl("_", ORGANISM, fixed=TRUE),
"underscores are not allowed in 'ORGANISM' (use spaces instead)")
## Check ASSEMBLED_MOLECULES.
stop_if(is.null(ASSEMBLED_MOLECULES),
"'ASSEMBLED_MOLECULES' must be defined")
stop_if(!is.character(ASSEMBLED_MOLECULES),
"'ASSEMBLED_MOLECULES' must be a character vector")
stop_if(!is_primary_key(ASSEMBLED_MOLECULES),
"'ASSEMBLED_MOLECULES' must ",
"not contain NAs, empty strings, or duplicates")
## Check CIRC_SEQS.
stop_if(is.null(CIRC_SEQS),
"'CIRC_SEQS' must be defined")
stop_if(!is.character(CIRC_SEQS),
"'CIRC_SEQS' must be a character vector")
stop_if(anyDuplicated(CIRC_SEQS),
"'CIRC_SEQS' must not contain duplicates")
stop_if(!all(CIRC_SEQS %in% ASSEMBLED_MOLECULES),
"'CIRC_SEQS' must be a subset of 'ASSEMBLED_MOLECULES'")
## Check GET_CHROM_SIZES.
if (!is.null(GET_CHROM_SIZES)) {
stop_if(!is.function(GET_CHROM_SIZES),
"when defined, 'GET_CHROM_SIZES' must be a function")
}
## Check NCBI_LINKER.
if (!is.null(NCBI_LINKER)) {
stop_if(!is.list(NCBI_LINKER),
"when defined, 'NCBI_LINKER' must be a named list")
linker_fields <- names(NCBI_LINKER)
stop_if(!is.character(linker_fields),
"when defined, 'NCBI_LINKER' must be a named list")
stop_if(!is_primary_key(linker_fields),
"the names on 'NCBI_LINKER' must ",
"not contain NAs, empty strings, or duplicates")
stop_if(!("assembly_accession" %in% linker_fields),
"'NCBI_LINKER' must have field \"assembly_accession\"")
accession <- NCBI_LINKER$assembly_accession
stop_if(!isSingleString(accession) || accession == "",
"\"assembly_accession\" field in 'NCBI_LINKER' must ",
"be a single non-empty string")
NCBI_assembly_info <- find_NCBI_assembly_info_for_accession(accession)
stop_if(is.null(NCBI_assembly_info),
"\"assembly_accession\" field in 'NCBI_LINKER' must ",
"be associated with a registered NCBI assembly")
stop_if(!identical(ORGANISM, NCBI_assembly_info$organism),
"the NCBI assembly associated with the \"assembly_accession\" ",
"field in 'NCBI_LINKER' is registered for an organism ",
"(\"", NCBI_assembly_info$organism, "\") that differs ",
"from 'ORGANISM' (\"", ORGANISM, "\")")
}
## Check ENSEMBL_LINKER.
if (!is.null(ENSEMBL_LINKER)) {
stop_if(!isSingleString(ENSEMBL_LINKER) || ENSEMBL_LINKER == "",
"when defined, 'ENSEMBL_LINKER' must ",
"be a single non-empty string")
}
list(GENOME=GENOME,
ORGANISM=ORGANISM,
ASSEMBLED_MOLECULES=ASSEMBLED_MOLECULES,
CIRC_SEQS=CIRC_SEQS,
GET_CHROM_SIZES=GET_CHROM_SIZES,
NCBI_LINKER=NCBI_LINKER,
ENSEMBL_LINKER=ENSEMBL_LINKER)
}
registered_UCSC_genomes <- function()
{
dir_path <- system.file("registered", "UCSC_genomes",
package="GenomeInfoDb")
file_paths <- list.files(dir_path, pattern="\\.R$", full.names=TRUE)
assemblies <- lapply(file_paths, .parse_script_for_registered_UCSC_genome)
colnames <- c("organism", "genome", "NCBI_assembly",
"assembly_accession", "with_Ensembl", "circ_seqs")
make_col <- function(j) {
colname <- colnames[[j]]
if (colname == "NCBI_assembly") {
col <- vapply(assemblies,
function(assembly_info) {
linker <- assembly_info$NCBI_LINKER
if (is.null(linker))
return(NA_character_)
accession <- linker$assembly_accession
find_NCBI_assembly_info_for_accession(accession)$assembly
}, character(1), USE.NAMES=FALSE)
return(col)
}
if (colname == "assembly_accession") {
col <- vapply(assemblies,
function(assembly_info) {
linker <- assembly_info$NCBI_LINKER
if (is.null(linker))
return(NA_character_)
linker$assembly_accession
}, character(1), USE.NAMES=FALSE)
return(col)
}
if (colname == "with_Ensembl") {
col <- vapply(assemblies,
function(assembly_info) {
linker <- assembly_info$ENSEMBL_LINKER
!is.null(linker)
}, logical(1), USE.NAMES=FALSE)
return(col)
}
COLNAME <- toupper(colname)
col0 <- lapply(assemblies, `[[`, COLNAME)
if (colname == "circ_seqs")
return(CharacterList(col0))
stopifnot(all(lengths(col0) == 1L))
col <- as.character(unlist(col0, use.names=FALSE))
if (colname == "organism")
col <- factor(col) # order of levels will dictate order
# of rows in final DataFrame
col
}
listData <- lapply(setNames(seq_along(colnames), colnames), make_col)
ans <- S4Vectors:::new_DataFrame(listData, nrows=length(assemblies))
genome_trailing_digits <- sub("(.*[^0-9])([0-9]*)$", "\\2", ans$genome)
oo <- order(ans$organism, as.integer(genome_trailing_digits))
as.data.frame(ans[oo, , drop=FALSE])
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### getChromInfoFromUCSC()
###
.UCSC_cached_chrom_info <- new.env(parent=emptyenv())
.get_chrom_info_for_unregistered_UCSC_genome <- function(genome,
assembled.molecules.only=FALSE,
add.ensembl.col=FALSE,
goldenPath.url=getOption("UCSC.goldenPath.url"),
recache=FALSE)
{
ans <- .UCSC_cached_chrom_info[[genome]]
if (is.null(ans) || recache) {
ans <- try(suppressWarnings(
fetch_chrom_sizes_from_UCSC(genome,
goldenPath.url=goldenPath.url)),
silent=TRUE)
if (inherits(ans, "try-error"))
stop(wmsg("unknown UCSC genome: ", genome))
oo <- orderSeqlevels(ans[ , "chrom"])
ans <- S4Vectors:::extract_data_frame_rows(ans, oo)
## Add columns "assembled" and "circular".
assembled <- rep.int(NA, nrow(ans))
circular <- make_circ_flags_from_circ_seqs(ans[ , "chrom"])
ans <- cbind(ans, assembled=assembled, circular=circular)
.UCSC_cached_chrom_info[[genome]] <- ans
}
if (isTRUE(add.ensembl.col)) {
warning(wmsg("'add.ensembl.col' got ignored for unregistered ",
"UCSC genome ", genome, "."),
"\n ",
wmsg("Use 'add.ensembl.col=\"force\"' to try adding ",
"the \"ensembl\" column anyway."))
} else if (identical(add.ensembl.col, "force")) {
ans <- .add_ensembl_column(ans, genome,
goldenPath.url=goldenPath.url)
}
if (assembled.molecules.only)
warning(wmsg("'assembled.molecules' got ignored for unregistered ",
"UCSC genome ", genome, " (don't know what the ",
"assembled molecules are for an unregistered UCSC ",
"genome)"))
ans
}
.get_NCBI_linker <- function(map.NCBI, linker, GENOME)
{
if (is.list(map.NCBI))
return(map.NCBI)
if (isFALSE(map.NCBI))
return(NULL)
if (is.null(linker))
warning(wmsg("'map.NCBI' got ignored for registered ",
"UCSC genome ", GENOME, " (additional NCBI columns ",
"are only available for registered UCSC genomes ",
"based on an NCBI assembly)"))
linker
}
.get_Ensembl_linker <- function(add.ensembl.col, linker, GENOME)
{
if (identical(add.ensembl.col, "force"))
return(NA)
if (isFALSE(add.ensembl.col))
return(NULL)
if (is.null(linker))
warning(wmsg("'add.ensembl.col' got ignored for registered ",
"UCSC genome ", GENOME, " (genome was registered ",
"with no Ensembl sequence names linked to it)."),
"\n ",
wmsg("Use 'add.ensembl.col=\"force\"' to try adding ",
"the \"ensembl\" column anyway."))
linker
}
.get_chrom_info_for_registered_UCSC_genome <- function(script_path,
assembled.molecules.only=FALSE,
map.NCBI=FALSE,
add.ensembl.col=FALSE,
goldenPath.url=getOption("UCSC.goldenPath.url"),
recache=FALSE)
{
vars <- .parse_script_for_registered_UCSC_genome(script_path)
GENOME <- vars$GENOME
ASSEMBLED_MOLECULES <- vars$ASSEMBLED_MOLECULES
nb_assembled <- length(ASSEMBLED_MOLECULES)
assembled_idx <- seq_len(nb_assembled)
ans <- .UCSC_cached_chrom_info[[GENOME]]
if (is.null(ans) || recache) {
GET_CHROM_SIZES <- vars$GET_CHROM_SIZES
if (is.null(GET_CHROM_SIZES)) {
ans <- fetch_chrom_sizes_from_UCSC(GENOME,
goldenPath.url=goldenPath.url)
stopifnot(nrow(ans) == nb_assembled)
oo <- match(ASSEMBLED_MOLECULES, ans[ , "chrom"])
stopifnot(!anyNA(oo))
ans <- S4Vectors:::extract_data_frame_rows(ans, oo)
} else {
ans <- GET_CHROM_SIZES(goldenPath.url=goldenPath.url)
stopifnot(is.data.frame(ans),
identical(sapply(ans, class),
c(chrom="character", size="integer")),
identical(head(ans[ , "chrom"], n=nb_assembled),
ASSEMBLED_MOLECULES),
is_primary_key(ans[ , "chrom"]))
}
## Add columns "assembled" and "circular".
assembled <- logical(nrow(ans))
assembled[assembled_idx] <- TRUE
circular <- make_circ_flags_from_circ_seqs(ans[ , "chrom"],
vars$CIRC_SEQS)
ans <- cbind(ans, assembled=assembled, circular=circular)
.UCSC_cached_chrom_info[[GENOME]] <- ans
}
## Add NCBI cols.
NCBI_linker <- .get_NCBI_linker(map.NCBI, vars$NCBI_LINKER, GENOME)
if (!is.null(NCBI_linker))
ans <- do.call(.add_NCBI_cols_to_UCSC_chrom_info,
c(list(ans), NCBI_linker))
## Add Ensembl col.
Ensemb_linker <- .get_Ensembl_linker(add.ensembl.col, vars$ENSEMBL_LINKER,
GENOME)
if (!is.null(Ensemb_linker))
ans <- .add_ensembl_column(ans, GENOME, table=Ensemb_linker,
goldenPath.url=goldenPath.url)
if (assembled.molecules.only)
ans <- S4Vectors:::extract_data_frame_rows(ans, assembled_idx)
if (!is.null(NCBI_linker)) {
assembly_accession <- NCBI_linker$assembly_accession
NCBI_assembly_info <-
find_NCBI_assembly_info_for_accession(assembly_accession)
attr(ans, "NCBI_assembly_info") <- NCBI_assembly_info
}
ans
}
### Return a 4-column data.frame with columns "chrom" (character), "size"
### (integer), "assembled" (logical), and "circular" (logical).
getChromInfoFromUCSC <- function(genome,
assembled.molecules.only=FALSE,
map.NCBI=FALSE,
add.ensembl.col=FALSE,
goldenPath.url=getOption("UCSC.goldenPath.url"),
recache=FALSE,
as.Seqinfo=FALSE)
{
if (!isSingleString(genome))
stop(wmsg("'genome' must be a single string"))
if (!isTRUEorFALSE(assembled.molecules.only))
stop(wmsg("'assembled.molecules.only' must be TRUE or FALSE"))
if (!(isTRUEorFALSE(map.NCBI) ||
is.list(map.NCBI) && !is.null(names(map.NCBI))))
stop(wmsg("'map.NCBI' must be TRUE or FALSE ",
"(experts can also supply an \"NCBI linker\" ",
"as a named list)"))
if (!(isTRUEorFALSE(add.ensembl.col) ||
identical(add.ensembl.col, "force")))
stop(wmsg("'add.ensembl.col' must be TRUE or FALSE ",
"(experts can also use \"force\")"))
if (!isSingleString(goldenPath.url))
stop(wmsg("'goldenPath.url' must be a single string"))
if (!isTRUEorFALSE(recache))
stop(wmsg("'recache' must be TRUE or FALSE"))
if (!isTRUEorFALSE(as.Seqinfo))
stop(wmsg("'as.Seqinfo' must be TRUE or FALSE"))
script_name <- paste0(genome, ".R")
script_path <- system.file("registered", "UCSC_genomes", script_name,
package="GenomeInfoDb")
if (identical(script_path, "")) {
if (!isFALSE(map.NCBI))
warning(wmsg("'map.NCBI' got ignored for unregistered ",
"UCSC genome ", genome, " (additional NCBI columns ",
"are only available for registered UCSC genomes ",
"based on an NCBI assembly)"))
ans <- .get_chrom_info_for_unregistered_UCSC_genome(genome,
assembled.molecules.only=assembled.molecules.only,
add.ensembl.col=add.ensembl.col,
goldenPath.url=goldenPath.url,
recache=recache)
} else {
ans <- .get_chrom_info_for_registered_UCSC_genome(script_path,
assembled.molecules.only=assembled.molecules.only,
map.NCBI=map.NCBI,
add.ensembl.col=add.ensembl.col,
goldenPath.url=goldenPath.url,
recache=recache)
}
if (!as.Seqinfo)
return(ans)
Seqinfo(seqnames=ans[ , "chrom"],
seqlengths=ans[ , "size"],
isCircular=ans[ , "circular"],
genome=genome)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.