R/BSgenome-class.R

Defines functions .getBSgenomeSequence .loadBSgenomeSequence MaskedBSgenome BSgenome .makeBSgenomeSeqinfo .newLinkToCachedObject

Documented in BSgenome BSgenome

### =========================================================================
### BSgenome objects
### -------------------------------------------------------------------------


setClass("BSgenome",
    contains="GenomeDescription",
    representation(
        ## Name of the BSgenome data package where the BSgenome object is
        ## defined.
        pkgname="character",

        ## On-disk "single" and "multiple" sequences.
        single_sequences="OnDiskNamedSequences",
        multiple_sequences="RdaCollection",

        ## Permanent URL to the place where the FASTA files that were used
        ## to produce the "single" and "multiple" sequences above can be
        ## found (and downloaded).
        source_url="character",

        ## Named vector representing the translation table from the original
        ## seqnames (as stored in [email protected]@seqnames, the 'seqinfo' slot
        ## being inherited from the GenomeDescription class) to the user
        ## seqnames.
        user_seqnames="character",

        ## For SNPs injection.
        injectSNPs_handler="InjectSNPsHandler",

        ## Used for caching the single and multiple on-disk sequences.
        .seqs_cache="environment",
        .link_counts="environment",

        ## TODO: Remove the 2 slots below once all the BSgenome data packages
        ## with masks have been split in 2 packages (naked BSgenome +
        ## MaskedBSgenome).
        nmask_per_seq="integer",
        masks="RdaCollection"
    )
)

setClass("MaskedBSgenome",
    contains="BSgenome",
    representation(
        ## Name of the BSgenome data package where the MaskedBSgenome object
        ## is defined.
        masks_pkgname="character",

        ## Only the single sequences can be masked. They should all have the
        ## same set of masks.
        nmask_per_seq="integer",
        masks="RdaCollection"
    )
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Low-level helper functions used for delayed-loading/caching/unloading the
### sequences.
###

### Return a new link to a cached object.
### 'objname' is the name of the cached object.
### 'cache' is the caching environment.
### 'link_counts' is the environment where we keep track of the number of links
### for each cached object. When the number of links for a given cached object
### reaches 0, then it is removed from the cache.
.newLinkToCachedObject <- function(objname, cache, link_counts)
{
    ans <- new.env(parent=emptyenv())
    if (exists(objname, envir=link_counts, inherits=FALSE))
        link_count0 <- get(objname, envir=link_counts, inherits=FALSE) + 1L
    else
        link_count0 <- 1L
    reg.finalizer(ans,
        function(e)
        {
            link_count <- get(objname, envir=link_counts, inherits=FALSE) - 1L
            assign(objname, link_count, envir=link_counts)
            if (link_count == 0) {
                if (getOption("verbose"))
                    cat("uncaching ", objname, "\n", sep="")
                remove(list=objname, envir=cache)
            }
        }
    )
    assign(objname, link_count0, envir=link_counts)
    ans
}

setGeneric(".linkToCachedObject<-", signature="x",
    function(x, value) standardGeneric(".linkToCachedObject<-")
)

setReplaceMethod(".linkToCachedObject", "SharedVector",
    function(x, value)
    {
        x@.link_to_cached_object <- value
        x
    }
)

setReplaceMethod(".linkToCachedObject", "XString",
    function(x, value)
    {
        .linkToCachedObject(x@shared) <- value
        x
    }
)

setReplaceMethod(".linkToCachedObject", "MaskedXString",
    function(x, value)
    {
        .linkToCachedObject(x@unmasked) <- value
        x
    }
)

setReplaceMethod(".linkToCachedObject", "XStringSet",
    function(x, value)
    {
        ### This means that an XStringSet object with a "pool" slot of
        ### length != 1 will be permanently cached.
        if (length(x@pool) == 1L)
            x@pool@.link_to_cached_object_list[[1L]] <- value
        x
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The names of the built-in masks.
###

BUILTIN_MASKNAMES <- c("AGAPS", "AMB", "RM", "TRF")


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Accessors.
###

setMethod("length", "BSgenome", function(x) length(names(x)))

setGeneric("sourceUrl", function(x) standardGeneric("sourceUrl"))
setMethod("sourceUrl", "BSgenome", function(x) x@source_url)

setGeneric("mseqnames", function(x) standardGeneric("mseqnames"))
setMethod("mseqnames", "BSgenome",
    function(x)
    {
        ans <- names(x@multiple_sequences)
        if (length(ans) == 0L)
            ans <- NULL
        ans
    }
)

setMethod("names", "BSgenome", function(x) c(seqnames(x), mseqnames(x)))

setGeneric("masknames", function(x) standardGeneric("masknames"))
setMethod("masknames", "BSgenome",
    function(x)
    {
        if (x@nmask_per_seq == 0L)
            return(NULL)
        ## TODO: Put this kind of checking in a validity method for BSgenome
        ## objects (that's what validity methods are for).
        if (x@nmask_per_seq > length(BUILTIN_MASKNAMES))
            stop("internal anomaly: [email protected]_per_seq > ",
                 length(BUILTIN_MASKNAMES))
        BUILTIN_MASKNAMES[seq_len(x@nmask_per_seq)]
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### seqinfo() accessor and related.
###

setMethod("seqinfo", "BSgenome",
    function(x)
    {
        ans <- x@seqinfo
        seqlevels(ans) <- x@user_seqnames
        ans
    }
)

### This is a restricted "seqinfo<-" method for BSgenome objects that
### only supports replacement of the sequence names, i.e., except for their
### sequence names, Seqinfo objects 'value' and 'seqinfo(x)' must be identical.
setReplaceMethod("seqinfo", "BSgenome",
    function(x, new2old=NULL,
             pruning.mode=c("error", "coarse", "fine", "tidy"),
             value)
    {
        if (!is(value, "Seqinfo"))
            stop("the supplied 'seqinfo' must be a Seqinfo object")
        IN_THIS_CONTEXT <- paste0("when replacing the 'seqinfo' ",
                                  "of a BSgenome object")
        pruning.mode <- match.arg(pruning.mode)
        if (pruning.mode != "error")
            stop("'pruning.mode' not supported ", IN_THIS_CONTEXT)
        x_seqinfo <- seqinfo(x)
        if (is.null(new2old)) {
            ## Support no-op seqinfo(x) <- seqinfo(x).
            if (!identical(value, x_seqinfo))
                stop("'new2old' must be specified ", IN_THIS_CONTEXT)
            return(x)
        }
        if (length(value) != length(x_seqinfo))
            stop("the supplied 'seqinfo' must have the same length ",
                 "as the current 'seqinfo' ", IN_THIS_CONTEXT)
        if (!identical(new2old, seq_along(value)))
            stop("'new2old' must be NULL or equal to 'seq_along(value)' ",
                 IN_THIS_CONTEXT)
        new_seqnames <- seqnames(value)
        seqnames(x_seqinfo) <- new_seqnames
        if (!identical(value, x_seqinfo))
            stop("the supplied and current 'seqinfo' can differ only ",
                 "in their sequence names ", IN_THIS_CONTEXT)
        if (any(new_seqnames %in% mseqnames(x)))
            stop("the supplied 'seqnames' cannot match any of the ",
                 "multiple sequence names (as returned by 'mseqnames(x)')")
        x@user_seqnames[] <- new_seqnames  # using [] to preserve the names
        x
    }
)

setReplaceMethod("seqnames", "BSgenome",
    function(x, value)
    {
        x_seqinfo <- seqinfo(x)
        seqnames(x_seqinfo) <- value
        seqinfo(x, new2old=seq_along(x_seqinfo)) <- x_seqinfo
        x
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### SNP related accessors
###

setMethod("SNPlocs_pkgname", "BSgenome",
    function(x) SNPlocs_pkgname(x@injectSNPs_handler)
)

setMethod("snpcount", "BSgenome",
    function(x) snpcount(x@injectSNPs_handler)
)

setMethod("snplocs", "BSgenome",
    function(x, seqname, ...) snplocs(x@injectSNPs_handler, seqname, ...)
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### BSgenome constructor
###

### 'seqnames' only used for sanity check.
.makeBSgenomeSeqinfo <- function(single_sequences,
                                 circ_seqs, provider_version,
                                 seqnames)
{
    seqlengths <- seqlengths(single_sequences)
    if (length(seqnames) == 0L) {
        seqnames <- names(seqlengths)
    } else if (!identical(names(seqlengths), seqnames)) {
        stop("sequence names found in file '",
             seqlengthsFilepath(single_sequences),
             "' are not identical to 'seqnames'. ",
             "May be the data on disk is corrupted?")
    }
    if (identical(circ_seqs, NA)) {
        is_circ <- NA
    } else {
        is_circ <- seqnames %in% circ_seqs
    }
    Seqinfo(seqnames=seqnames,
            seqlengths=seqlengths,
            isCircular=is_circ,
            genome=provider_version)
}

### NOTES:
### - In BioC 2.14, the 'seqs_pkgname' and 'masks_pkgname' BSgenome slots
###   were replaced with the 'pkgname' slot but the 2 corresponding
###   arguments were kept for backward compatibility with existing
###   BSgenome packages.
### - In BioC 3.1, the 'species' argument was replaced with the 'common_name'
###   argument but the former was kept for backward compatibility with
###   existing BSgenome packages.
BSgenome <- function(organism, common_name, provider, provider_version,
                     release_date, release_name, source_url,
                     seqnames, circ_seqs=NA, mseqnames,
                     seqs_pkgname, seqs_dirpath,
                     nmask_per_seq=0, masks_pkgname=NA, masks_dirpath=NA,
                     species=NA_character_)
{
    twobit_filename <- "single_sequences.2bit"
    twobit_filepath <- file.path(seqs_dirpath, twobit_filename)
    if (file.exists(twobit_filepath)) {
        single_sequences <- TwobitNamedSequences(twobit_filepath)
    } else {
        fa_filename <- "single_sequences.fa"
        fa_filepath <- file.path(seqs_dirpath, fa_filename)
        farz_filename <- paste0(fa_filename, ".rz")
        farz_filepath <- file.path(seqs_dirpath, farz_filename)
        if (file.exists(farz_filepath)) {
            single_sequences <- FastaNamedSequences(farz_filepath)
        } else if (file.exists(fa_filepath)) {
            single_sequences <- FastaNamedSequences(fa_filepath)
        } else {
            single_sequences <- RdaNamedSequences(seqs_dirpath, seqnames)
        }
    }
    seqinfo <- .makeBSgenomeSeqinfo(single_sequences,
                                    circ_seqs, provider_version,
                                    seqnames)
    seqnames <- seqnames(seqinfo)
    if (missing(common_name))
        common_name <- species
    genome_description <- GenomeDescription(organism, common_name,
                                            provider, provider_version,
                                            release_date, release_name,
                                            seqinfo)
    if (is.null(mseqnames))
        mseqnames <- character(0)
    multiple_sequences <- RdaCollection(seqs_dirpath, mseqnames)
    names(user_seqnames) <- user_seqnames <- seqnames
    nmask_per_seq <- as.integer(nmask_per_seq)
    masks_dirpath <- as.character(masks_dirpath)
    if (nmask_per_seq == 0L || length(seqnames) == 0L) {
        masks <- RdaCollection(masks_dirpath, character(0))
    } else {
        masks <- RdaCollection(masks_dirpath, paste0(seqnames, ".masks"))
    }

    new("BSgenome", genome_description,
        pkgname=seqs_pkgname,
        single_sequences=single_sequences,
        multiple_sequences=multiple_sequences,
        source_url=source_url,
        user_seqnames=user_seqnames,
        .seqs_cache=new.env(parent=emptyenv()),
        .link_counts=new.env(parent=emptyenv()),
        nmask_per_seq=nmask_per_seq,
        masks=masks
    )
}

MaskedBSgenome <- function(ref_bsgenome,
                           masks_pkgname, nmask_per_seq, masks_dirpath)
{
    masks <- RdaCollection(masks_dirpath,
                           paste0(seqnames(ref_bsgenome), ".masks"))
    new("MaskedBSgenome", ref_bsgenome,
        .seqs_cache=new.env(parent=emptyenv()),
        .link_counts=new.env(parent=emptyenv()),
        masks_pkgname=masks_pkgname,
        nmask_per_seq=as.integer(nmask_per_seq),
        masks=masks) 
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Coercion
###

setMethod("as.list", "BSgenome",
    function(x)
        lapply(setNames(seqnames(x), seqnames(x)),
               function(seqname) x[[seqname]])
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The "show" method
###

.SHOW_BSGENOME_PREFIX <- "# "
.SHOW_SEQSECTION_PREFIX <- "#   "

setMethod("show", "BSgenome",
    function(object)
    {
        mystrwrap <- function(line)
            writeLines(strwrap(line, width=getOption("width")+1,
                               exdent=0L, prefix=.SHOW_BSGENOME_PREFIX))
        if (!is.na(object@common_name))
            cat(object@common_name, "genome:\n")
        GenomeInfoDb:::showGenomeDescription(object,
                                             margin=.SHOW_BSGENOME_PREFIX)
        if (!is.null(SNPlocs_pkgname(object)))
            cat(.SHOW_BSGENOME_PREFIX, "with SNPs injected from package: ", SNPlocs_pkgname(object), "\n", sep="")
        if (length(mseqnames(object)) == 0L) {
            what <- "sequence"
        } else {
            cat(.SHOW_BSGENOME_PREFIX, "\n", sep="")
            what <- "single sequence"
        }
        mystrwrap(paste0(length(seqnames(object)), " ", what, "s:"))
        if (length(seqnames(object)) != 0L) {
            printAtomicVectorInAGrid(seqnames(object),
                                     prefix=.SHOW_SEQSECTION_PREFIX)
            mystrwrap(paste0("(use 'seqnames()' to see all the ",
                             what, " names, ",
                             "use the '$' or '[[' operator to access a given ",
                             what, ")"))
        } else {
            cat(.SHOW_SEQSECTION_PREFIX, "NONE\n", sep="")
        }
        if (length(mseqnames(object)) != 0L) {
            cat(.SHOW_BSGENOME_PREFIX, "\n", sep="")
            mystrwrap("multiple sequences:")
            printAtomicVectorInAGrid(mseqnames(object),
                                     prefix=.SHOW_SEQSECTION_PREFIX)
            mystrwrap(paste0("(use 'mseqnames()' to see all the ",
                             "multiple sequence names, ",
                             "use the '$' or '[[' operator to access a given ",
                             "multiple sequence)"))
        }
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### List-element extraction (with [[).
###

.loadBSgenomeSequence <- function(x, seqname, user_seqname)
{
    idx <- match(user_seqname, x@user_seqnames)
    if (is.na(idx)) {  # multiple sequence
        if (substr(seqname, 1L, 8L) == "upstream") {
            msg <- c(
                "  Starting with BioC 3.0, the upstream sequences ",
                "are defunct.\n",
                "  However they can easily be extracted ",
                "from the full genome\n  sequences with something like ",
                "(for example for hg19):\n\n",
                "      library(TxDb.Hsapiens.UCSC.hg19.knownGene)\n",
                "      txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene\n",
                "      gn <- sort(genes(txdb))\n",
                "      up1000 <- flank(gn, width=1000)\n",
                "      library(BSgenome.Hsapiens.UCSC.hg19)\n",
                "      genome <- BSgenome.Hsapiens.UCSC.hg19\n",
                "      up1000seqs <- getSeq(genome, up1000)\n\n",
                "  IMPORTANT: Make sure you use a TxDb package (or ",
                "TranscriptDb object)\n  that contains a gene model ",
                "based on the exact same reference genome\n",
                "  as the BSgenome object you pass to getSeq(). Note that ",
                "you can make\n  your own custom TranscriptDb object from ",
                "various annotation resources.\n",
                "  See the makeTxDbFromUCSC(), ",
                "makeTxDbFromBiomart(), and\n",
                "  makeTxDbFromGFF() functions in the ",
                "GenomicFeatures package."
            )
            .Defunct(msg=paste0(msg, collapse=""))
        }
        ans <- x@multiple_sequences[[seqname]]
        return(ans)
    }
    # single sequence
    ans <- x@single_sequences[[seqname]]
    ## Check the length of the sequence
    if (length(ans) != seqlengths(x)[[user_seqname]]) {
        stop(user_seqname, " sequence does not have the expected length. ",
             "May be the data on disk is corrupted?")
    }
    ## Inject the SNPs, if any
    snps <- snplocs(x, seqname)
    if (!is.null(snps))
        .inplaceReplaceLetterAt(ans, snps$loc, snps$alleles_as_ambig)
    ## Load and set the built-in masks, if any
    nmask_per_seq <- length(masknames(x))
    if (nmask_per_seq > 0L) {
        masks_objname <- paste0(seqname, ".masks")
        builtinmasks <- x@masks[[masks_objname]]
        if (length(builtinmasks) < nmask_per_seq) {
            masks_filepath <- rdaPath(x@masks, masks_objname)
            stop("expecting ", nmask_per_seq, " built-in masks per ",
                 "single sequence, found only ", length(builtinmasks),
                 " in file '", masks_filepath, "'. ",
                 "May be the data on disk is corrupted?")
        }
        if (length(builtinmasks) > nmask_per_seq)
            builtinmasks <- builtinmasks[seq_len(nmask_per_seq)]
        if (!identical(names(builtinmasks), masknames(x))) {
            masks_filepath <- rdaPath(x@masks, masks_objname)
            stop("mask names found in file '", masks_filepath, "' are not ",
                 "identical to the names returned by masknames(). ",
                 "May be the data on disk is corrupted?")
        }
        masks(ans) <- builtinmasks
    }
    ans
}

.getBSgenomeSequence <- function(x, seqname, user_seqname)
{
    seqs_cache <- x@.seqs_cache
    ## Using the 'if (!exists()) assign(); get()' approach is NOT 100%
    ## reliable:
    ##
    ##   if (!exists(seqname, envir=seqs_cache, inherits=FALSE)) {
    ##       ...
    ##       assign(seqname, ans, envir=seqs_cache)
    ##   }
    ##   get(seqname, envir=seqs_cache, inherits=FALSE)
    ##
    ## because the symbol (seqname) can disappear from the cache between
    ## the moment we test for its presence and the moment we try to get it.
    ## It's not me being paranoid, we've seen this happen! One possible
    ## explanation for this is that the symbol was candidate for removal
    ## from the cache but that removal didn't happen yet because gc() had
    ## not yet been called (removal from the cache is implemented thru the
    ## finalizers registered on the objects that are copied from the cache
    ## and made available to the user). Then the call to get() would trigger
    ## garbbage collection and that in turn would trigger the removal of
    ## the symbol *before* get() had a chance to get to it. So now we use
    ## the 'try(get(...))' approach, which hopefully is 100% reliable!
    ans <- try(get(seqname, envir=seqs_cache, inherits=FALSE), silent=TRUE)
    if (is(ans, "try-error")) {
        ans <- .loadBSgenomeSequence(x, seqname, user_seqname)
        if (getOption("verbose"))
            cat("caching ", seqname, "\n", sep="")
        assign(seqname, ans, envir=seqs_cache)
    }
    .linkToCachedObject(ans) <- .newLinkToCachedObject(
                                    seqname,
                                    seqs_cache,
                                    x@.link_counts)
    ans
}

setMethod("[[", "BSgenome",
    function(x, i, j, ...)
    {
        ## 'x' is guaranteed to be a "BSgenome" object (if it's not, then the
        ## method dispatch algo will not even call this method), so nargs() is
        ## guaranteed to be >= 1
        if (nargs() >= 3L)
            stop("too many subscripts")
        subscripts <- list(...)
        if (!missing(i))
            subscripts$i <- i
        if (!missing(j))
            subscripts$j <- j
        ## At this point, 'subscripts' should be guaranteed
        ## to be of length <= 1
        if (length(subscripts) == 0L)
            stop("no index specified")
        i <- subscripts[[1]]
        if (length(i) < 1)
            stop("attempt to select less than one element")
        if (length(i) > 1)
            stop("attempt to select more than one element")
        if (is.character(i)) {
            user_seqname <- try(match.arg(i, names(x)), silent=TRUE)
            if (is(user_seqname, "try-error"))
                stop("no such sequence")
        } else {
            if (!is.numeric(i) || is.na(i))
                stop("no such sequence")
            i <- as.integer(i)
            if (i < 1L || length(x) < i)
                stop("no such sequence")
            user_seqname <- names(x)[i]
        }
        ## Translate user-supplied sequence named back to original name.
        idx <- match(user_seqname, x@user_seqnames)
        if (is.na(idx)) {
            seqname <- user_seqname  # multiple sequence
        } else {
            seqname <- names(x@user_seqnames)[[idx]]  # single sequence
        }
        .getBSgenomeSequence(x, seqname, user_seqname)
    }
)

setMethod("$", "BSgenome",
    function(x, name) x[[name]]
)

Try the BSgenome package in your browser

Any scripts or data that you put into this service are public.

BSgenome documentation built on May 6, 2019, 2:29 a.m.