R/bubbles-methods.R

Defines functions .extract_bubbles_from_txpathmat .get_bubble_variants .make_logical_matrix_from_bit_strings .make_bit_strings_from_logical_matrix .is_bubble2 .is_bubble .get_sgnodetypes_from_txpathmat

### =========================================================================
### "bubbles" methods
### -------------------------------------------------------------------------


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .get_sgnodetypes_from_txpathmat()
###

### Supported types are 1 (exon start), 2 (exon end), 0 (R or L nodes).
### Returns an integer vector of length the nb of cols in 'txpathmat' and
### named with 'txpathmat' colnames.
### TODO: (a) Add an sgnodetypes() accessor to sgedges-methods.R, (b) move
### the .get_sgnodetypes_from_txpathmat() helper to that file, and (c) use
### it internally in sgnodetypes().
.get_sgnodetypes_from_txpathmat <- function(txpathmat, check.matrix=FALSE)
{
    ans <- integer(ncol(txpathmat))
    names(ans) <- colnames(txpathmat)
    for (i in seq_len(nrow(txpathmat))) {
        idx <- which(txpathmat[i, , drop=FALSE])
        if (length(idx) <= 2L)
            next()
        idx <- idx[-c(1L, length(idx))]
        exon_start_idx <- idx[c(TRUE, FALSE)]
        exon_end_idx <- idx[c(FALSE, TRUE)]
        if (check.matrix) {
            if (any(ans[exon_start_idx] == 2L) || any(ans[exon_end_idx] == 1L))
                stop("invalid matrix of transcript paths: ",
                     "some columns in 'txpathmat' seem to correspond ",
                     "at the same time to an exon start and an exon end")
        }
        ans[exon_start_idx] <- 1L
        ans[exon_end_idx] <- 2L
    }
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .extract_bubbles_from_txpathmat()
###

### We must have 1 <= i, j <= ncol(txpathmat), and j - i >= 2. This is not
### checked!
.is_bubble <- function(txpathmat, i, j)
{
    txbase <- txpathmat[, i] & txpathmat[, j]
    if (sum(txbase) <= 1L)
        return(FALSE)
    for (k in (i+1L):(j-1L))
        if (all(txpathmat[ , k] >= txbase))
            return(FALSE)
    TRUE
}

### A fancy alternative to .is_bubble() that tries to avoid the for loop.
### Surprisingly, it turned out to be slower than .is_bubble() in practise.
.is_bubble2 <- function(txpathmat, i, j)
{
    txbase <- txpathmat[, i] & txpathmat[, j]
    if (sum(txbase) <= 1L)
        return(FALSE)
    all(colSums(txpathmat[ , (i+1L):(j-1L), drop=FALSE] < txbase) >= 1)
}

### Assumes 'm' to be a logical matrix. Not checked.
.make_bit_strings_from_logical_matrix <- function(m)
{
    apply(m, 1L, function(x) paste(as.integer(x), collapse=""))
}

### Assumes 's' to be a vector of equal-length strings made of 0s and 1s. Not
### checked.
.make_logical_matrix_from_bit_strings <- function(s)
{
    all_letters <- unlist(strsplit(s, NULL, fixed=TRUE), use.names=FALSE)
    matrix(as.logical(as.integer(all_letters)), nrow=length(s), byrow=TRUE,
           dimnames=list(names(s), NULL))
}

### Returns a DataFrame with 1 row per variant and the following cols:
### partition <CharacterList>, path <CharacterList>, and code <character>.
.get_bubble_variants <- function(txpathmat, sgnodetypes, i, j)
{
    txbase <- txpathmat[, i] & txpathmat[, j]
    bubble_submat <- txpathmat[txbase, (i+1L):(j-1L), drop=FALSE]

    ## Remove cols with FALSEs only.
    bubble_submat <- bubble_submat[ , colSums(bubble_submat) != 0L, drop=FALSE]
    bubble_submat_rownames <- rownames(bubble_submat)
    bubble_submat_colnames <- colnames(bubble_submat)

    ## Compute variant paths (1 per row).
    ans_path <- CharacterList(
                    lapply(seq_len(nrow(bubble_submat)),
                           function(i)
                               bubble_submat_colnames[bubble_submat[i, ]])
                )

    ## Compute variant relative paths (1 per row).
    ans_rpath <- IntegerList(
                    lapply(seq_len(nrow(bubble_submat)),
                           function(i)
                               which(bubble_submat[i, ]))
                )

    ## Compute variant code (1 per row).
    ans_code <- sapply(seq_len(length(ans_path)),
                       function(k)
                       {
                           path <- ans_path[[k]]
                           path_len <- length(path)
                           if (path_len == 0L)
                               return("0")
                           types <- c("-", "^")[sgnodetypes[path]]
                           ## Sanity check would fail if 'sgnodetypes[path]'
                           ## contained 0s but this should never happen.
                           if (length(types) != path_len)
                               stop("some splicing sites in variant path ",
                                    "are of type 0")
                           if (i == 1L)
                               types[1L] <- "["
                           if (j == ncol(txpathmat))
                               types[length(types)] <- "]"
                           paste0(ans_rpath[[k]], types, collapse="")
                       })

    ## Order the variants by lexicographic order on their code.
    oo <- order(ans_code)
    bubble_submat_rownames <- bubble_submat_rownames[oo]
    ans_path <- ans_path[oo]
    #ans_rpath <- ans_rpath[oo]
    ans_code <- ans_code[oo]

    ## Identify unique variants.
    ans_code1 <- ans_code[-length(ans_code)]
    ans_code2 <- ans_code[-1L]
    is_not_dup <- c(TRUE, ans_code1 != ans_code2)
    ans_path <- ans_path[is_not_dup]
    #ans_rpath <- ans_rpath[is_not_dup]
    ans_code <- ans_code[is_not_dup]

    ## Compute variant partitions.
    ans_partition <- unname(splitAsList(bubble_submat_rownames,
                                        cumsum(is_not_dup)))

    ## Make and return the DataFrame.
    DataFrame(partition=ans_partition,
              path=ans_path,
              #rpath=ans_rpath,
              code=ans_code)
}

### Returns a DataFrame with 1 row per bubble and the following cols:
### source <character>, sink <character>, d <integer>, partitions
### <CharacterList>, paths <CharacterList>, AScode <character>, and
### description <character>.
#.extract_bubbles_from_txpathmat <- function(txpathmat, outdeg, indeg)
.extract_bubbles_from_txpathmat <- function(txpathmat)
{
    ## The call to .get_bubble_variants() in the inner loop below calls
    ## order() on a character vector. Unfortunately the behavior of order()
    ## on a character vector depends on the collating sequence of the locale
    ## in use!
    ## So we set LC_COLLATE to C so that (1) the ouput of order() on a
    ## character vector is platform/country independent, and (2) it will
    ## behave the same way when called in the context of the unit tests
    ## run by 'R CMD check' ('R CMD check' also sets the LC_COLLATE to C
    ## when running the tests) vs when called in the context of an
    ## interactive session.
    ## Another advantage of doing this is that order() seems to be at least
    ## 2x faster when LC_COLLATE is set to C vs to something like en_US.UTF-8,
    ## which comes as a pleasant surprise!
    ## TODO: Maybe we should define an strorder() function in
    ## S4Vectors/R/str-utils.R for portable/deterministic ordering of a
    ## character vector. See R/utils.R in the GenomicFeatures package
    ## for a similar discussion about using rank() on a character vector.
    prev_locale <- Sys.getlocale("LC_COLLATE")
    Sys.setlocale("LC_COLLATE", "C")
    on.exit(Sys.setlocale("LC_COLLATE", prev_locale))

    sgnodetypes <- .get_sgnodetypes_from_txpathmat(txpathmat)
    ans_source <- ans_sink <- ans_AScode <- character(0)
    ans_d <- integer(0)
    ans_partitions <- ans_paths <- CharacterList()
    ## Surprisingly, the outdeg/indeg trick turned out to not make any
    ## noticeable speed difference in practise.
    #ii0 <- which(outdeg >= 2L)
    #jj0 <- which(indeg >= 2L)
    #for (i in ii0) {
    #    jj <- jj0[jj0 >= i + 2L]
    #    for (j in jj) {
    ncol <- ncol(txpathmat)
    for (i in 1:(ncol-2L)) {
        for (j in (i+2L):ncol) {
            if (!.is_bubble(txpathmat, i, j))
                next
            bubble_variants <- .get_bubble_variants(txpathmat, sgnodetypes,
                                                    i, j)
            bubble_d <- nrow(bubble_variants)
            if (bubble_d <= 1L)
                next
            ans_source <- c(ans_source, colnames(txpathmat)[i])
            ans_sink <- c(ans_sink, colnames(txpathmat)[j])
            ans_d <- c(ans_d, bubble_d)
            ## Format the bubble partitions.
            bubble_partitions <- bubble_variants[ , "partition"]
            bubble_partitions <- sapply(bubble_partitions, base::paste, collapse=",")
            bubble_partitions <- paste0("{", bubble_partitions, "}")
            ans_partitions <- c(ans_partitions,
                                CharacterList(bubble_partitions))
            ## Format the bubble paths.
            bubble_paths <- bubble_variants[ , "path"]
            bubble_paths <- sapply(bubble_paths, base::paste, collapse=",")
            bubble_paths <- paste0("{", bubble_paths, "}")
            ans_paths <- c(ans_paths, CharacterList(bubble_paths))
            ## Format the bubble AScode.
            bubble_AScode <- paste(bubble_variants[ , "code"], collapse=",")
            ans_AScode <- c(ans_AScode, bubble_AScode)
        }
    }
    DataFrame(source=ans_source,
              sink=ans_sink,
              d=ans_d,
              partitions=ans_partitions,
              paths=ans_paths,
              AScode=ans_AScode,
              description=unname(ASCODE2DESC[ans_AScode]))
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### bubbles()
###

setGeneric("bubbles", signature="x",
    function(x) standardGeneric("bubbles")
)

setMethod("bubbles", "SplicingGraphs",
    function(x)
    {
        if (length(x) != 1L)
            stop("'x' must be a SplicingGraphs object of length 1")
        bubbles_cache <- x@.bubbles_cache
        ans <- try(get(names(x), envir=bubbles_cache, inherits=FALSE),
                   silent=TRUE)
        if (is(ans, "try-error")) {
            ans <- callNextMethod()
            assign(names(x), ans, envir=bubbles_cache)
        }
        ans
    }
)

setMethod("bubbles", "ANY",
    function(x)
    {
        txpath <- txpath(x)
        bubbles(txpath)
    }
)

setMethod("bubbles", "IntegerList",
    function(x)
    {
        txpathmat <- make_matrix_from_txpath(x)
        #outdeg <- outdeg(x)
        #indeg <- indeg(x)
        #.extract_bubbles_from_txpathmat(txpathmat, outdeg, indeg)
        .extract_bubbles_from_txpathmat(txpathmat)
    }
)

### TODO: Add "bubbles" methods for data.frame and DataFrame objects.
Bioconductor/SplicingGraphs documentation built on Nov. 3, 2024, 8:11 a.m.