R/bubbles-methods.R

Defines functions .extract_bubbles_from_txpathmat .get_bubble_variants .make_logical_matrix_from_bit_strings .make_bit_strings_from_logical_matrix .is_bubble2 .is_bubble .get_sgnodetypes_from_txpathmat

### =========================================================================
### "bubbles" methods
### -------------------------------------------------------------------------


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .get_sgnodetypes_from_txpathmat()
###

### Supported types are 1 (exon start), 2 (exon end), 0 (R or L nodes).
### Returns an integer vector of length the nb of cols in 'txpathmat' and
### named with 'txpathmat' colnames.
### TODO: (a) Add an sgnodetypes() accessor to sgedges-methods.R, (b) move
### the .get_sgnodetypes_from_txpathmat() helper to that file, and (c) use
### it internally in sgnodetypes().
.get_sgnodetypes_from_txpathmat <- function(txpathmat, check.matrix=FALSE)
{
    ans <- integer(ncol(txpathmat))
    names(ans) <- colnames(txpathmat)
    for (i in seq_len(nrow(txpathmat))) {
        idx <- which(txpathmat[i, , drop=FALSE])
        if (length(idx) <= 2L)
            next()
        idx <- idx[-c(1L, length(idx))]
        exon_start_idx <- idx[c(TRUE, FALSE)]
        exon_end_idx <- idx[c(FALSE, TRUE)]
        if (check.matrix) {
            if (any(ans[exon_start_idx] == 2L) || any(ans[exon_end_idx] == 1L))
                stop("invalid matrix of transcript paths: ",
                     "some columns in 'txpathmat' seem to correspond ",
                     "at the same time to an exon start and an exon end")
        }
        ans[exon_start_idx] <- 1L
        ans[exon_end_idx] <- 2L
    }
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .extract_bubbles_from_txpathmat()
###

### We must have 1 <= i, j <= ncol(txpathmat), and j - i >= 2. This is not
### checked!
.is_bubble <- function(txpathmat, i, j)
{
    txbase <- txpathmat[, i] & txpathmat[, j]
    if (sum(txbase) <= 1L)
        return(FALSE)
    for (k in (i+1L):(j-1L))
        if (all(txpathmat[ , k] >= txbase))
            return(FALSE)
    TRUE
}

### A fancy alternative to .is_bubble() that tries to avoid the for loop.
### Surprisingly, it turned out to be slower than .is_bubble() in practise.
.is_bubble2 <- function(txpathmat, i, j)
{
    txbase <- txpathmat[, i] & txpathmat[, j]
    if (sum(txbase) <= 1L)
        return(FALSE)
    all(colSums(txpathmat[ , (i+1L):(j-1L), drop=FALSE] < txbase) >= 1)
}

### Assumes 'm' to be a logical matrix. Not checked.
.make_bit_strings_from_logical_matrix <- function(m)
{
    apply(m, 1L, function(x) paste(as.integer(x), collapse=""))
}

### Assumes 's' to be a vector of equal-length strings made of 0s and 1s. Not
### checked.
.make_logical_matrix_from_bit_strings <- function(s)
{
    all_letters <- unlist(strsplit(s, NULL, fixed=TRUE), use.names=FALSE)
    matrix(as.logical(as.integer(all_letters)), nrow=length(s), byrow=TRUE,
           dimnames=list(names(s), NULL))
}

### Returns a DataFrame with 1 row per variant and the following cols:
### partition <CharacterList>, path <CharacterList>, and code <character>.
.get_bubble_variants <- function(txpathmat, sgnodetypes, i, j)
{
    txbase <- txpathmat[, i] & txpathmat[, j]
    bubble_submat <- txpathmat[txbase, (i+1L):(j-1L), drop=FALSE]

    ## Remove cols with FALSEs only.
    bubble_submat <- bubble_submat[ , colSums(bubble_submat) != 0L, drop=FALSE]
    bubble_submat_rownames <- rownames(bubble_submat)
    bubble_submat_colnames <- colnames(bubble_submat)

    ## Compute variant paths (1 per row).
    ans_path <- CharacterList(
                    lapply(seq_len(nrow(bubble_submat)),
                           function(i)
                               bubble_submat_colnames[bubble_submat[i, ]])
                )

    ## Compute variant relative paths (1 per row).
    ans_rpath <- IntegerList(
                    lapply(seq_len(nrow(bubble_submat)),
                           function(i)
                               which(bubble_submat[i, ]))
                )

    ## Compute variant code (1 per row).
    ans_code <- sapply(seq_len(length(ans_path)),
                       function(k)
                       {
                           path <- ans_path[[k]]
                           path_len <- length(path)
                           if (path_len == 0L)
                               return("0")
                           types <- c("-", "^")[sgnodetypes[path]]
                           ## Sanity check would fail if 'sgnodetypes[path]'
                           ## contained 0s but this should never happen.
                           if (length(types) != path_len)
                               stop("some splicing sites in variant path ",
                                    "are of type 0")
                           if (i == 1L)
                               types[1L] <- "["
                           if (j == ncol(txpathmat))
                               types[length(types)] <- "]"
                           paste0(ans_rpath[[k]], types, collapse="")
                       })

    ## Order the variants by lexicographic order on their code.
    oo <- order(ans_code)
    bubble_submat_rownames <- bubble_submat_rownames[oo]
    ans_path <- ans_path[oo]
    #ans_rpath <- ans_rpath[oo]
    ans_code <- ans_code[oo]

    ## Identify unique variants.
    ans_code1 <- ans_code[-length(ans_code)]
    ans_code2 <- ans_code[-1L]
    is_not_dup <- c(TRUE, ans_code1 != ans_code2)
    ans_path <- ans_path[is_not_dup]
    #ans_rpath <- ans_rpath[is_not_dup]
    ans_code <- ans_code[is_not_dup]

    ## Compute variant partitions.
    ans_partition <- unname(splitAsList(bubble_submat_rownames,
                                        cumsum(is_not_dup)))

    ## Make and return the DataFrame.
    DataFrame(partition=ans_partition,
              path=ans_path,
              #rpath=ans_rpath,
              code=ans_code)
}

### Returns a DataFrame with 1 row per bubble and the following cols:
### source <character>, sink <character>, d <integer>, partitions
### <CharacterList>, paths <CharacterList>, AScode <character>, and
### description <character>.
#.extract_bubbles_from_txpathmat <- function(txpathmat, outdeg, indeg)
.extract_bubbles_from_txpathmat <- function(txpathmat)
{
    ## The call to .get_bubble_variants() in the inner loop below calls
    ## order() on a character vector. Unfortunately the behavior of order()
    ## on a character vector depends on the collating sequence of the locale
    ## in use!
    ## So we set LC_COLLATE to C so that (1) the ouput of order() on a
    ## character vector is platform/country independent, and (2) it will
    ## behave the same way when called in the context of the unit tests
    ## run by 'R CMD check' ('R CMD check' also sets the LC_COLLATE to C
    ## when running the tests) vs when called in the context of an
    ## interactive session.
    ## Another advantage of doing this is that order() seems to be at least
    ## 2x faster when LC_COLLATE is set to C vs to something like en_US.UTF-8,
    ## which comes as a pleasant surprise!
    ## TODO: Maybe we should define an strorder() function in
    ## S4Vectors/R/str-utils.R for portable/deterministic ordering of a
    ## character vector. See R/utils.R in the GenomicFeatures package
    ## for a similar discussion about using rank() on a character vector.
    prev_locale <- Sys.getlocale("LC_COLLATE")
    Sys.setlocale("LC_COLLATE", "C")
    on.exit(Sys.setlocale("LC_COLLATE", prev_locale))

    sgnodetypes <- .get_sgnodetypes_from_txpathmat(txpathmat)
    ans_source <- ans_sink <- ans_AScode <- character(0)
    ans_d <- integer(0)
    ans_partitions <- ans_paths <- CharacterList()
    ## Surprisingly, the outdeg/indeg trick turned out to not make any
    ## noticeable speed difference in practise.
    #ii0 <- which(outdeg >= 2L)
    #jj0 <- which(indeg >= 2L)
    #for (i in ii0) {
    #    jj <- jj0[jj0 >= i + 2L]
    #    for (j in jj) {
    ncol <- ncol(txpathmat)
    for (i in 1:(ncol-2L)) {
        for (j in (i+2L):ncol) {
            if (!.is_bubble(txpathmat, i, j))
                next
            bubble_variants <- .get_bubble_variants(txpathmat, sgnodetypes,
                                                    i, j)
            bubble_d <- nrow(bubble_variants)
            if (bubble_d <= 1L)
                next
            ans_source <- c(ans_source, colnames(txpathmat)[i])
            ans_sink <- c(ans_sink, colnames(txpathmat)[j])
            ans_d <- c(ans_d, bubble_d)
            ## Format the bubble partitions.
            bubble_partitions <- bubble_variants[ , "partition"]
            bubble_partitions <- sapply(bubble_partitions, base::paste, collapse=",")
            bubble_partitions <- paste0("{", bubble_partitions, "}")
            ans_partitions <- c(ans_partitions,
                                CharacterList(bubble_partitions))
            ## Format the bubble paths.
            bubble_paths <- bubble_variants[ , "path"]
            bubble_paths <- sapply(bubble_paths, base::paste, collapse=",")
            bubble_paths <- paste0("{", bubble_paths, "}")
            ans_paths <- c(ans_paths, CharacterList(bubble_paths))
            ## Format the bubble AScode.
            bubble_AScode <- paste(bubble_variants[ , "code"], collapse=",")
            ans_AScode <- c(ans_AScode, bubble_AScode)
        }
    }
    DataFrame(source=ans_source,
              sink=ans_sink,
              d=ans_d,
              partitions=ans_partitions,
              paths=ans_paths,
              AScode=ans_AScode,
              description=unname(ASCODE2DESC[ans_AScode]))
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### bubbles()
###

setGeneric("bubbles", signature="x",
    function(x) standardGeneric("bubbles")
)

setMethod("bubbles", "SplicingGraphs",
    function(x)
    {
        if (length(x) != 1L)
            stop("'x' must be a SplicingGraphs object of length 1")
        bubbles_cache <- x@.bubbles_cache
        ans <- try(get(names(x), envir=bubbles_cache, inherits=FALSE),
                   silent=TRUE)
        if (is(ans, "try-error")) {
            ans <- callNextMethod()
            assign(names(x), ans, envir=bubbles_cache)
        }
        ans
    }
)

setMethod("bubbles", "ANY",
    function(x)
    {
        txpath <- txpath(x)
        bubbles(txpath)
    }
)

setMethod("bubbles", "IntegerList",
    function(x)
    {
        txpathmat <- make_matrix_from_txpath(x)
        #outdeg <- outdeg(x)
        #indeg <- indeg(x)
        #.extract_bubbles_from_txpathmat(txpathmat, outdeg, indeg)
        .extract_bubbles_from_txpathmat(txpathmat)
    }
)

### TODO: Add "bubbles" methods for data.frame and DataFrame objects.

Try the SplicingGraphs package in your browser

Any scripts or data that you put into this service are public.

SplicingGraphs documentation built on Nov. 8, 2020, 5:58 p.m.