R/BSgenome-class.R
In BSgenome: Software infrastructure for efficient representation of full genomes and their SNPs

Documented in BSgenome BSgenome

### =========================================================================
### BSgenome objects
### -------------------------------------------------------------------------


setClass("BSgenome",
    ## Expected metadata data are:
    ## - organism
    ## - common_name
    ## - genome
    ## - provider
    ## - release_date
    ## - source_url: permanent URL to the place where the 2bit and/or FASTA
    ##   files used to produce the "single" and "multiple" sequences can be
    ##   found (and downloaded).
    contains="Annotated",
    representation(
        ## Name of the BSgenome data package where the BSgenome object is
        ## defined.
        pkgname="character",

        ## On-disk "single" and "multiple" sequences.
        single_sequences="OnDiskNamedSequences",
        multiple_sequences="RdaCollection",

        ## Original seqinfo.
        seqinfo="Seqinfo",

        ## Named vector representing the translation table from the original
        ## seqnames (as stored in self@seqinfo@seqnames) to the user seqnames.
        user_seqnames="character",

        ## For SNPs injection.
        injectSNPs_handler="InjectSNPsHandler",

        ## Used for caching the single and multiple on-disk sequences.
        .seqs_cache="environment",
        .link_counts="environment"
    )
)

setClass("MaskedBSgenome",
    contains="BSgenome",
    representation(
        ## Name of the BSgenome data package where the MaskedBSgenome object
        ## is defined.
        masks_pkgname="character",

        ## Only the single sequences can be masked. They should all have the
        ## same set of masks.
        nmask_per_seq="integer",
        masks="RdaCollection"
    )
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Low-level helper functions used for delayed-loading/caching/unloading the
### sequences.
###

### Return a new link to a cached object.
### 'objname' is the name of the cached object.
### 'cache' is the caching environment.
### 'link_counts' is the environment where we keep track of the number of links
### for each cached object. When the number of links for a given cached object
### reaches 0, then it is removed from the cache.
.newLinkToCachedObject <- function(objname, cache, link_counts)
{
    ans <- new.env(parent=emptyenv())
    if (exists(objname, envir=link_counts, inherits=FALSE))
        link_count0 <- get(objname, envir=link_counts, inherits=FALSE) + 1L
    else
        link_count0 <- 1L
    reg.finalizer(ans,
        function(e)
        {
            link_count <- get(objname, envir=link_counts, inherits=FALSE) - 1L
            assign(objname, link_count, envir=link_counts)
            if (link_count == 0) {
                if (getOption("verbose"))
                    cat("uncaching ", objname, "\n", sep="")
                remove(list=objname, envir=cache)
            }
        }
    )
    assign(objname, link_count0, envir=link_counts)
    ans
}

setGeneric(".linkToCachedObject<-", signature="x",
    function(x, value) standardGeneric(".linkToCachedObject<-")
)

setReplaceMethod(".linkToCachedObject", "SharedVector",
    function(x, value)
    {
        x@.link_to_cached_object <- value
        x
    }
)

setReplaceMethod(".linkToCachedObject", "XString",
    function(x, value)
    {
        .linkToCachedObject(x@shared) <- value
        x
    }
)

setReplaceMethod(".linkToCachedObject", "MaskedXString",
    function(x, value)
    {
        .linkToCachedObject(x@unmasked) <- value
        x
    }
)

setReplaceMethod(".linkToCachedObject", "XStringSet",
    function(x, value)
    {
        ### This means that an XStringSet object with a "pool" slot of
        ### length != 1 will be permanently cached.
        if (length(x@pool) == 1L)
            x@pool@.link_to_cached_object_list[[1L]] <- value
        x
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The names of the built-in masks.
###

BUILTIN_MASKNAMES <- c("AGAPS", "AMB", "RM", "TRF")


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Accessors
###

### Metadata.
setMethod("organism", "BSgenome",
    function(object) metadata(object)$organism
)
setMethod("commonName", "BSgenome",
    function(object) metadata(object)$common_name
)
setMethod("provider", "BSgenome", function(x) metadata(x)$provider)
setMethod("providerVersion", "BSgenome",
    function(x)
    {
        msg <- c("Using providerVersion() on a ", class(x), " object ",
                 "is deprecated. Please use 'metadata(x)$genome' instead.")
        .Deprecated(msg=c("  ", wmsg(msg)))
        metadata(x)$genome
    }
)
setMethod("releaseDate", "BSgenome", function(x) metadata(x)$release_date)
setMethod("releaseName", "BSgenome",
    function(x)
    {
        msg <- c("Starting with Bioconductor 3.12, BSgenome objects no ",
                 "longer have a \"release name\". As a consequence of ",
                 "this change calling releaseName() on a BSgenome object ",
                 "now always returns NA and is deprecated.")
        .Deprecated(msg=c("  ", wmsg(msg)))
        NA_character_
    }
)
setGeneric("sourceUrl", function(x) standardGeneric("sourceUrl"))
setMethod("sourceUrl", "BSgenome", function(x) metadata(x)$source_url)

setMethod("length", "BSgenome", function(x) length(names(x)))

setGeneric("mseqnames", function(x) standardGeneric("mseqnames"))
setMethod("mseqnames", "BSgenome",
    function(x)
    {
        ans <- names(x@multiple_sequences)
        if (length(ans) == 0L)
            ans <- NULL
        ans
    }
)

setMethod("names", "BSgenome", function(x) c(seqnames(x), mseqnames(x)))

setGeneric("masknames", function(x) standardGeneric("masknames"))

setMethod("masknames", "BSgenome", function(x) NULL)

setMethod("masknames", "MaskedBSgenome",
    function(x)
    {
        ## TODO: Put this kind of checking in a validity method for
        ## MaskedBSgenome objects (that's what validity methods are for).
        if (x@nmask_per_seq > length(BUILTIN_MASKNAMES))
            stop("internal anomaly: x@nmask_per_seq > ",
                 length(BUILTIN_MASKNAMES))
        BUILTIN_MASKNAMES[seq_len(x@nmask_per_seq)]
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### seqinfo() accessor and related.
###

setMethod("seqnames", "BSgenome", function(x) unname(x@user_seqnames))

setMethod("seqinfo", "BSgenome",
    function(x)
    {
        ans <- x@seqinfo
        seqlevels(ans) <- seqnames(x)
        ans
    }
)

### We implement a restricted seqinfo() setter for BSgenome object 'x' that
### supports altering **only** the seqlevels and/or genome of 'seqinfo(x)'.
### It does NOT allow subsetting 'seqinfo(x)' (by dropping/reordering some
### of its seqlevels), or altering its seqlengths or circularity flags!
### In other words, except for their seqnames() and genome(), Seqinfo
### objects 'new_seqinfo' and 'old_seqinfo' must be identical. This is all
### we need to make the seqlevelsStyle() setter work on a BSgenome object.
.check_new2old_and_new_seqinfo <-
    function(new2old, new_seqinfo, old_seqinfo, context="")
{
    if (length(new_seqinfo) != length(old_seqinfo))
        stop(wmsg("the supplied 'seqinfo' must have the same ",
                  "length as the current 'seqinfo'", context))
    if (!(is.null(new2old) || identical(new2old, seq_along(new_seqinfo))))
        stop(wmsg("'new2old' can only be set to NULL or ",
                  "'seq_along(seqinfo(x))'", context))
    seqnames(old_seqinfo) <- seqnames(new_seqinfo)
    genome(old_seqinfo) <- genome(new_seqinfo)
    if (!identical(new_seqinfo, old_seqinfo))
        stop(wmsg("seqlengths() and isCircular() of the supplied 'seqinfo' ",
                  "must be identical to seqlengths() and isCircular() of ",
                  "the current 'seqinfo'", context))
}

.set_BSgenome_seqinfo <-
    function(x, new2old=NULL,
                pruning.mode=c("error", "coarse", "fine", "tidy"),
                value)
{
    if (!is(value, "Seqinfo"))
        stop(wmsg("the supplied 'seqinfo' must be a Seqinfo object"))
    context <- paste0(" when replacing the 'seqinfo' of a ",
                      classNameForDisplay(x), " object")
    pruning.mode <- match.arg(pruning.mode)
    if (pruning.mode != "error")
        stop(wmsg("'pruning.mode' is not supported", context))
    .check_new2old_and_new_seqinfo(new2old, value, seqinfo(x), context)
    new_seqnames <- seqnames(value)
    if (any(new_seqnames %in% mseqnames(x)))
        stop(wmsg("the supplied 'seqnames' cannot match any of the ",
                  "multiple sequence names (as returned by 'mseqnames(x)')"))
    x@user_seqnames[] <- new_seqnames  # using [] to preserve the names
    genome(x@seqinfo) <- unname(genome(value))
    x
}

setReplaceMethod("seqinfo", "BSgenome", .set_BSgenome_seqinfo)

setReplaceMethod("seqnames", "BSgenome",
    function(x, value)
    {
        seqnames(seqinfo(x)) <- value
        x
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### SNP related accessors
###

setMethod("SNPlocs_pkgname", "BSgenome",
    function(x) SNPlocs_pkgname(x@injectSNPs_handler)
)

setMethod("snpcount", "BSgenome",
    function(x) snpcount(x@injectSNPs_handler)
)

setMethod("snplocs", "BSgenome",
    function(x, seqname, ...) snplocs(x@injectSNPs_handler, seqname, ...)
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### BSgenome constructor
###

### 'seqnames' only used for sanity check.
.make_BSgenome_seqinfo <- function(single_sequences, circ_seqs,
                                   genome, seqnames)
{
    seqlengths <- seqlengths(single_sequences)
    if (length(seqnames) == 0L) {
        seqnames <- names(seqlengths)
    } else if (!identical(names(seqlengths), seqnames)) {
        stop("sequence names found in file '",
             seqlengthsFilepath(single_sequences),
             "' are not identical to 'seqnames'. ",
             "May be the data on disk is corrupted?")
    }
    if (identical(circ_seqs, NA)) {
        is_circ <- NA
    } else {
        is_circ <- seqnames %in% circ_seqs
    }
    Seqinfo(seqnames=seqnames,
            seqlengths=seqlengths,
            isCircular=is_circ,
            genome=genome)
}

### NOTES:
### - In BioC 2.14, the 'seqs_pkgname' BSgenome slot was renamed 'pkgname'
###   but the corresponding argument was not renamed for backward
###   compatibility with existing BSgenome packages.
### - In BioC 3.1, the 'species' argument was replaced with the 'common_name'
###   argument but the former was kept for backward compatibility with
###   existing BSgenome packages.
### - In BioC 3.12, the 'provider_version' argument was replaced with the
###   'genome' argument, and the 'release_name' became ignored and was only
###   kept for backward compatibility.
BSgenome <- function(organism, common_name, genome,
                     provider, provider_version,
                     release_date, release_name, source_url,
                     seqnames, circ_seqs=NA, mseqnames,
                     seqs_pkgname, seqs_dirpath,
                     species=NA_character_)
{
    single_sequences <- OnDiskNamedSequences(seqs_dirpath, seqnames=seqnames)
    if (missing(genome))
        genome <- provider_version
    seqinfo <- .make_BSgenome_seqinfo(single_sequences, circ_seqs,
                                      genome, seqnames)
    seqnames <- seqnames(seqinfo)
    if (missing(common_name))
        common_name <- species
    metadata <- list(organism=organism,
                     common_name=common_name,
                     genome=genome,
                     provider=provider,
                     release_date=release_date,
                     source_url=source_url)
    if (is.null(mseqnames))
        mseqnames <- character(0)
    multiple_sequences <- RdaCollection(seqs_dirpath, mseqnames)
    names(user_seqnames) <- user_seqnames <- seqnames

    new("BSgenome", metadata=metadata,
                    pkgname=seqs_pkgname,
                    single_sequences=single_sequences,
                    multiple_sequences=multiple_sequences,
                    seqinfo=seqinfo,
                    user_seqnames=user_seqnames,
                    .seqs_cache=new.env(parent=emptyenv()),
                    .link_counts=new.env(parent=emptyenv())
    )
}

MaskedBSgenome <- function(ref_bsgenome,
                           masks_pkgname, nmask_per_seq, masks_dirpath)
{
    masks <- RdaCollection(masks_dirpath,
                           paste0(seqnames(ref_bsgenome), ".masks"))
    new("MaskedBSgenome", ref_bsgenome,
        .seqs_cache=new.env(parent=emptyenv()),
        .link_counts=new.env(parent=emptyenv()),
        masks_pkgname=masks_pkgname,
        nmask_per_seq=as.integer(nmask_per_seq),
        masks=masks) 
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Coercion
###

setMethod("as.list", "BSgenome",
    function(x)
        lapply(setNames(seqnames(x), seqnames(x)),
               function(seqname) x[[seqname]])
)

setAs("BSgenome", "GenomeDescription",
    function(from)
    {
        metadata <- metadata(from)
        GenomeDescription(organism=metadata$organism,
                          common_name=metadata$common_name,
                          provider=metadata$provider,
                          provider_version=metadata$genome,
                          release_date=metadata$release_date,
                          release_name=NA_character_,
                          seqinfo=seqinfo(from))
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### bsgenomeName()
###

setMethod("bsgenomeName", "BSgenome",
    function(x) bsgenomeName(as(x, "GenomeDescription"))
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The "show" method
###

.print_BSgenome_metadata <- function(metadata, prefix="")
{
    common_name <- metadata$common_name
    if (!is.na(common_name))
        cat(common_name, "genome:\n")
    cat(prefix, "organism: ", metadata$organism,
                " (", common_name, ")\n", sep="")
    cat(prefix, "genome: ", metadata$genome, "\n", sep="")
    cat(prefix, "provider: ", metadata$provider, "\n", sep="")
    cat(prefix, "release date: ", metadata$release_date, "\n", sep="")
}

.SHOW_BSGENOME_PREFIX <- "# "
.SHOW_SEQSECTION_PREFIX <- "#   "

setMethod("show", "BSgenome",
    function(object)
    {
        mystrwrap <- function(line)
            writeLines(strwrap(line, width=getOption("width")+1,
                               exdent=0L, prefix=.SHOW_BSGENOME_PREFIX))
        .print_BSgenome_metadata(metadata(object), prefix=.SHOW_BSGENOME_PREFIX)
        if (!is.null(SNPlocs_pkgname(object)))
            cat(.SHOW_BSGENOME_PREFIX, "with SNPs injected from package: ", SNPlocs_pkgname(object), "\n", sep="")
        if (length(mseqnames(object)) == 0L) {
            what <- "sequence"
        } else {
            cat(.SHOW_BSGENOME_PREFIX, "\n", sep="")
            what <- "single sequence"
        }
        mystrwrap(paste0(length(seqnames(object)), " ", what, "s:"))
        if (length(seqnames(object)) != 0L) {
            printAtomicVectorInAGrid(seqnames(object),
                                     prefix=.SHOW_SEQSECTION_PREFIX)
            mystrwrap(paste0("(use 'seqnames()' to see all the ",
                             what, " names, ",
                             "use the '$' or '[[' operator to access a given ",
                             what, ")"))
        } else {
            cat(.SHOW_SEQSECTION_PREFIX, "NONE\n", sep="")
        }
        if (length(mseqnames(object)) != 0L) {
            cat(.SHOW_BSGENOME_PREFIX, "\n", sep="")
            mystrwrap("multiple sequences:")
            printAtomicVectorInAGrid(mseqnames(object),
                                     prefix=.SHOW_SEQSECTION_PREFIX)
            mystrwrap(paste0("(use 'mseqnames()' to see all the ",
                             "multiple sequence names, ",
                             "use the '$' or '[[' operator to access a given ",
                             "multiple sequence)"))
        }
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### List-element extraction (with [[).
###

.loadBSgenomeSequence <- function(x, seqname, user_seqname)
{
    idx <- match(user_seqname, x@user_seqnames)
    if (is.na(idx)) {  # multiple sequence
        if (substr(seqname, 1L, 8L) == "upstream") {
            msg <- c(
                "  Starting with BioC 3.0, the upstream sequences ",
                "are defunct.\n",
                "  However they can easily be extracted ",
                "from the full genome\n  sequences with something like ",
                "(for example for hg19):\n\n",
                "      library(TxDb.Hsapiens.UCSC.hg19.knownGene)\n",
                "      txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene\n",
                "      gn <- sort(genes(txdb))\n",
                "      up1000 <- flank(gn, width=1000)\n",
                "      library(BSgenome.Hsapiens.UCSC.hg19)\n",
                "      genome <- BSgenome.Hsapiens.UCSC.hg19\n",
                "      up1000seqs <- getSeq(genome, up1000)\n\n",
                "  IMPORTANT: Make sure you use a TxDb package (or ",
                "TranscriptDb object)\n  that contains a gene model ",
                "based on the exact same reference genome\n",
                "  as the BSgenome object you pass to getSeq(). Note that ",
                "you can make\n  your own custom TranscriptDb object from ",
                "various annotation resources.\n",
                "  See the makeTxDbFromUCSC(), ",
                "makeTxDbFromBiomart(), and\n",
                "  makeTxDbFromGFF() functions in the ",
                "GenomicFeatures package."
            )
            .Defunct(msg=paste0(msg, collapse=""))
        }
        ans <- x@multiple_sequences[[seqname]]
        return(ans)
    }
    # single sequence
    ans <- getListElement(x@single_sequences, seqname)
    ## Check the length of the sequence
    if (length(ans) != seqlengths(x)[[user_seqname]]) {
        stop(user_seqname, " sequence does not have the expected length. ",
             "May be the data on disk is corrupted?")
    }
    ## Inject the SNPs, if any
    snps <- snplocs(x, seqname)
    if (!is.null(snps))
        .inplaceReplaceLetterAt(ans, snps$loc, snps$alleles_as_ambig)
    ## Load and set the built-in masks, if any
    nmask_per_seq <- length(masknames(x))
    if (nmask_per_seq > 0L) {
        masks_objname <- paste0(seqname, ".masks")
        builtinmasks <- x@masks[[masks_objname]]
        if (length(builtinmasks) < nmask_per_seq) {
            masks_path <- rdaPath(x@masks, masks_objname)
            stop("expecting ", nmask_per_seq, " built-in masks per ",
                 "single sequence, found only ", length(builtinmasks),
                 " in file '", masks_path, "'. ",
                 "May be the data on disk is corrupted?")
        }
        if (length(builtinmasks) > nmask_per_seq)
            builtinmasks <- builtinmasks[seq_len(nmask_per_seq)]
        if (!identical(names(builtinmasks), masknames(x))) {
            masks_path <- rdaPath(x@masks, masks_objname)
            stop("mask names found in file '", masks_path, "' are not ",
                 "identical to the names returned by masknames(). ",
                 "May be the data on disk is corrupted?")
        }
        masks(ans) <- builtinmasks
    }
    ans
}

.getBSgenomeSequence <- function(x, seqname, user_seqname)
{
    seqs_cache <- x@.seqs_cache
    ## Using the 'if (!exists()) assign(); get()' approach is NOT 100%
    ## reliable:
    ##
    ##   if (!exists(seqname, envir=seqs_cache, inherits=FALSE)) {
    ##       ...
    ##       assign(seqname, ans, envir=seqs_cache)
    ##   }
    ##   get(seqname, envir=seqs_cache, inherits=FALSE)
    ##
    ## because the symbol (seqname) can disappear from the cache between
    ## the moment we test for its presence and the moment we try to get it.
    ## It's not me being paranoid, we've seen this happen! One possible
    ## explanation for this is that the symbol was candidate for removal
    ## from the cache but that removal didn't happen yet because gc() had
    ## not yet been called (removal from the cache is implemented thru the
    ## finalizers registered on the objects that are copied from the cache
    ## and made available to the user). Then the call to get() would trigger
    ## garbbage collection and that in turn would trigger the removal of
    ## the symbol *before* get() had a chance to get to it. So now we use
    ## the 'try(get(...))' approach, which hopefully is 100% reliable!
    ans <- try(get(seqname, envir=seqs_cache, inherits=FALSE), silent=TRUE)
    if (is(ans, "try-error")) {
        ans <- .loadBSgenomeSequence(x, seqname, user_seqname)
        if (getOption("verbose"))
            cat("caching ", seqname, "\n", sep="")
        assign(seqname, ans, envir=seqs_cache)
    }
    .linkToCachedObject(ans) <- .newLinkToCachedObject(
                                    seqname,
                                    seqs_cache,
                                    x@.link_counts)
    ans
}

setMethod("[[", "BSgenome",
    function(x, i, j, ...)
    {
        ## 'x' is guaranteed to be a "BSgenome" object (if it's not, then the
        ## method dispatch algo will not even call this method), so nargs() is
        ## guaranteed to be >= 1
        if (nargs() >= 3L)
            stop("too many subscripts")
        subscripts <- list(...)
        if (!missing(i))
            subscripts$i <- i
        if (!missing(j))
            subscripts$j <- j
        ## At this point, 'subscripts' should be guaranteed
        ## to be of length <= 1
        if (length(subscripts) == 0L)
            stop("no index specified")
        i <- subscripts[[1L]]
        if (length(i) < 1L)
            stop("attempt to select less than one element")
        if (length(i) > 1L)
            stop("attempt to select more than one element")
        if (is.character(i)) {
            user_seqname <- try(match.arg(i, names(x)), silent=TRUE)
            if (is(user_seqname, "try-error"))
                stop("no such sequence")
        } else {
            if (!is.numeric(i) || is.na(i))
                stop("no such sequence")
            i <- as.integer(i)
            if (i < 1L || length(x) < i)
                stop("no such sequence")
            user_seqname <- names(x)[i]
        }
        ## Translate user-supplied sequence named back to original name.
        idx <- match(user_seqname, x@user_seqnames)
        if (is.na(idx)) {
            seqname <- user_seqname  # multiple sequence
        } else {
            seqname <- names(x@user_seqnames)[[idx]]  # single sequence
        }
        .getBSgenomeSequence(x, seqname, user_seqname)
    }
)

setMethod("$", "BSgenome",
    function(x, name) x[[name]]
)
Any scripts or data that you put into this service are public.
BSgenome documentation built on Nov. 8, 2020, 7:48 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
BSgenome
Software infrastructure for efficient representation of full genomes and their SNPs

R/BSgenome-class.R
In BSgenome: Software infrastructure for efficient representation of full genomes and their SNPs

Defines functions .getBSgenomeSequence .loadBSgenomeSequence .print_BSgenome_metadata MaskedBSgenome BSgenome .make_BSgenome_seqinfo .set_BSgenome_seqinfo .check_new2old_and_new_seqinfo .newLinkToCachedObject

Documented in BSgenome BSgenome

Try the BSgenome package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BSgenome Software infrastructure for efficient representation of full genomes and their SNPs

R/BSgenome-class.R In BSgenome: Software infrastructure for efficient representation of full genomes and their SNPs

Defines functions .getBSgenomeSequence .loadBSgenomeSequence .print_BSgenome_metadata MaskedBSgenome BSgenome .make_BSgenome_seqinfo .set_BSgenome_seqinfo .check_new2old_and_new_seqinfo .newLinkToCachedObject

Documented in BSgenome BSgenome

Try the BSgenome package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BSgenome
Software infrastructure for efficient representation of full genomes and their SNPs

R/BSgenome-class.R
In BSgenome: Software infrastructure for efficient representation of full genomes and their SNPs