R/utils.R
In GenomicFeatures: Conveniently import and query gene models

### =========================================================================
### Miscellaneous low-level utils
### -------------------------------------------------------------------------
###
### Unless stated otherwise, nothing in this file is exported.
###


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### DB related.
###
### Most of this stuff was copy/pasted from AnnotationDbi (trying to avoid
### depending on AnnotationDbi for now).
###

### Environment for storing run-time objects
RTobjs <- new.env(hash=TRUE, parent=emptyenv())

assign("debugSQL", FALSE, envir=RTobjs)

debugSQL <- function()
{
    debugSQL <- !get("debugSQL", envir=RTobjs)
    assign("debugSQL", debugSQL, envir=RTobjs)
    debugSQL
}


### Use dbQuery(conn, SQL, 1) instead of dbQuery(conn, SQL)[[1]],
### it's much safer!
dbEasyQuery <- function(conn, SQL, j0=NA)
{
    if (get("debugSQL", envir=RTobjs)) {
        if (!is.character(SQL) || length(SQL) != 1L || is.na(SQL))
            stop("[debugSQL] 'SQL' must be a single string")
        cat("[debugSQL] SQL query: ", SQL, "\n", sep="")
        st <- system.time(data0 <- dbGetQuery(conn, SQL))
        cat("[debugSQL]      time: ", st["user.self"], " seconds\n", sep="")
    } else {
        data0 <- dbGetQuery(conn, SQL)
    }
    if (is.na(j0))
        return(data0)
    ## Needed to deal properly with data frame with 0 column ("NULL data
    ## frames with 0 rows") returned by RSQLite when the result of a SELECT
    ## query has 0 row
    if (nrow(data0) == 0L)
        character(0)
    else
        data0[[j0]]
}

### TODO: Put this in AnnotationDbi.
queryAnnotationDb <- function(annotationdb, sql)
{
    AnnotationDbi:::dbEasyQuery(dbconn(annotationdb),
                                paste(sql, collapse="\n"))
}

insert_data_into_table <- function(conn, table, data)
{
    stopifnot(is.list(data))
    placeholders <- paste(rep.int("?", length(data)), collapse=",")
    SQL <- sprintf("INSERT INTO %s VALUES (%s)", table, placeholders)
    ## dbExecute() emits annoying warnings if 'params' is a named list or if
    ## some of its list elements are factors.
    params <- unname(as.list(data))
    params <- lapply(params,
        function(x) if (is.factor(x)) as.character(x) else x)
    dbExecute(conn, SQL, params=params)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Data frame related.
###
### TODO: Find a better home for these low-level data.frame utils.
###

### Not data.frame specific. Would also work on any matrix-like object.
has_col <- function(x, colnames) {colnames %in% colnames(x)}

check_colnames <- function(x, required_colnames,
                              optional_colnames=character(0),
                              x_what="x")
{
    if (!is.data.frame(x))
        stop(wmsg("'", x_what, "' must be a data frame"))
    x_colnames <- colnames(x)
    if (!all(required_colnames %in% x_colnames))
        stop(wmsg("'", x_what, "' must have at least the following col",
                  ifelse(length(required_colnames) >= 2L, "s", ""),
                  ": ", paste(required_colnames, collapse=", ")))
    supported_colnames <- c(required_colnames, optional_colnames)
    is_supported_col <- x_colnames %in% supported_colnames
    if (any(duplicated(x_colnames[is_supported_col])))
        stop(wmsg("'", x_what, "' has duplicated colnames"))
    ignored_idx <- which(!is_supported_col)
    if (length(ignored_idx) != 0L)
        warning(wmsg("ignoring the following col",
                     ifelse(length(ignored_idx) >= 2L, "s", ""),
                     " in '", x_what, "': ",
                     paste(x_colnames[ignored_idx], collapse=", ")))
}

makeZeroRowDataFrame <- function(col2class)
{
    if (!is.character(col2class) || is.null(names(col2class)))
        stop("'col2class' must be a named character vector")
    as.data.frame(lapply(col2class, function(class) get(class)()),
                  stringsAsFactors=FALSE)
}

### Sets the class of (all or some of) the columns of a data.frame.
### Typical use:
###   x <- setDataFrameColClass(x, c(colA="integer", colB="factor"))
### Note that if 'x' has more than one "colA" col, then *all* of them are
### coerced to integer.
setDataFrameColClass <- function(x, col2class, drop.extra.cols=FALSE)
{
    if (!is.data.frame(x))
        stop("'x' must be a data.frame")
    if (!is.character(col2class) || is.null(names(col2class)))
        stop("'col2class' must be a named character vector")
    if (!all(names(col2class) %in% colnames(x)))
        stop("'col2class' has invalid names")
    if (!isTRUEorFALSE(drop.extra.cols))
        stop("'drop.extra.cols' must be TRUE or FALSE")
    if (drop.extra.cols) {
        col_idx <- which(colnames(x) %in% names(col2class))
    } else {
        col_idx <- seq_len(ncol(x))
    }
    tmp <- lapply(col_idx,
                  function(j)
                  {
                      col <- x[[j]]
                      colname <- colnames(x)[j]
                      if (!(colname %in% names(col2class)))
                          return(col)
                      class <- col2class[[colname]]
                      FUNname <- paste("as", class, sep=".")
                      if (exists(FUNname) && is.function(FUN <- get(FUNname)))
                          return(FUN(col))
                      as(col, class)
                  })
    names(tmp) <- colnames(x)[col_idx]
    return(data.frame(tmp, check.names=FALSE, stringsAsFactors=FALSE))
}

### Acts like an SQL *inner* join.
### 'x' must be a data frame. 'name2val' must be a named atomic vector or
### a named factor. 'join_colname' must be the name of the col in 'x' whose
### values are matched against 'names(name2val)'. 'vals_colname' must be the
### name of the col that will be populated with the appropriate 'name2val'
### vals and bound to 'x'.
### Note that this acts like an SQL *inner* join, not a *left* join, i.e.
### rows in 'x' that can't be mapped to a value in 'name2val' are dropped.
joinDataFrameWithName2Val <- function(x, join_colname, name2val, vals_colname)
{
    if (!is.data.frame(x))
        stop("'x' must be a data.frame")
    if (!isSingleString(join_colname) || !has_col(x, join_colname))
        stop("'join_colname' must be a valid colname for 'x'")
    if (!is.vector(name2val) && !is.factor(name2val))
        stop("'name2val' must be a vector (or factor)")
    if (!is.atomic(name2val) || is.null(names(name2val)))
        stop("'name2val' must be atomic and have names")
    if (!isSingleString(vals_colname))
        stop("invalid 'vals_colname'")
    join_col <- as.character(x[[join_colname]])
    common_names <- intersect(join_col, names(name2val))
    name2val <- name2val[names(name2val) %in% common_names]
    x <- S4Vectors:::extract_data_frame_rows(x, join_col %in% common_names)
    tmp <- split(as.vector(name2val), names(name2val))
    ## as.character() is required below just because 'x[[join_colname]]'
    ## could be a factor (subsetting by a factor is equivalent to subsetting
    ## by an integer vector but this is not what we want here).
    tmp <- tmp[as.character(x[[join_colname]])]
    x <- S4Vectors:::extract_data_frame_rows(x,
                             rep.int(seq_len(nrow(x)), elementNROWS(tmp)))
    if (nrow(x) == 0L)
        vals <- name2val[FALSE]
    else if (is.factor(name2val))
        vals <- factor(unname(unlist(tmp)), levels=levels(name2val))
    else
        vals <- unname(unlist(tmp))
    x[[vals_colname]] <- vals
    x
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### ID assignment and/or reassignment.
###

### Mimicking the interface of chartr().
### If 'old_ids' and 'ids' are character vectors, then
### 'translateIds(old_ids, new_ids, ids)' is equivalent to
### 'names(new_ids) <- old_ids; new_ids[ids]'.
translateIds <- function(old_ids, new_ids, ids)
{
    if (!is.atomic(old_ids) || !is.atomic(new_ids) || !is.atomic(ids))
        stop("'old_ids', 'new_ids' and 'ids' must be atomic vectors")
    if (length(old_ids) != length(new_ids))
        stop("'old_ids' and 'new_ids' must have the same length")
    new_ids[match(ids, old_ids)]
}

### Returns the vector of ids such that 'unique(x)[ids, ]' is identical
### to 'x' (in the same way that 'levels(f)[f]' is identical to
### 'as.vector(f)' when 'f' is a character factor).
### This unambiguously defines 'ids'. In particular, it's not Locale
### specific, despite the fact that the current implementation uses a
### sorting approach.
### TODO: Remove! (not used anymore)
makeIdsForUniqueDataFrameRows <- function(x)
{
    if (!is.data.frame(x))
        stop("'x' must be a data.frame")
    x_order <- do.call(order, x)
    x_dups <- duplicated(x)
    ## First we make "provisory" ids. Those ids *are* Locale specific.
    prov_ids <- integer(nrow(x))
    prov_ids[x_order] <- cumsum(!x_dups[x_order])
    ## Convert the "provisory" ids into the final ids. The final ids are
    ## *not* Locale specific anymore.
    as.integer(factor(prov_ids, levels=unique(prov_ids)))
}

### 'name' and 'type' each must be either NULL, or an integer or character
### vector, or a factor. Possibly with NAs. One of them can be NULL but not
### both of them. If none of them is NULL, they must have the same length N.
### Returns an integer vector of length N.
.rank_name_type <- function(name, type)
{
    ## We set LC_COLLATE to C so (1) the ouput of rank() on a character vector
    ## is platform/country independent, and (2) it will behave the same way
    ## when called in the context of the unit tests run by 'R CMD check'
    ## ('R CMD check' also sets the LC_COLLATE to C when running the tests)
    ## vs when called in the context of an interactive session.
    ## TODO: Maybe we should define an strrank() function in
    ## S4Vectors/R/str-utils.R for portable/deterministic ranking of a
    ## character vector. See R/bubbles-methods.R in the SplicingGraphs package
    ## for a similar discussion about using order() on a character vector.
    prev_locale <- Sys.getlocale("LC_COLLATE")
    Sys.setlocale("LC_COLLATE", "C")
    on.exit(Sys.setlocale("LC_COLLATE", prev_locale))
    if (!is.integer(name)) {
        if (is.null(name)) {
            name <- integer(length(type))
        } else if (is.numeric(name)) {
            name <- as.integer(name)
        } else {
            if (!is.character(name))
                name <- as.character(name)
            name <- rank(name, na.last="keep", ties.method="min")
        }
    }
    ## Features with no name (e.g. tx_name is NA) go last.
    name[is.na(name)] <- .Machine$integer.max
    if (!is.integer(type)) {
        if (is.null(type)) {
            type <- integer(length(name))
        } else if (is.factor(type) || is.numeric(type)) {
            type <- as.integer(type)
        } else {
            if (!is.character(type))
                type <- as.character(type)
            type <- rank(type, na.last="keep", ties.method="min")
        }
    }
    ## Features with no type (e.g. tx_type is NA) go last.
    type[is.na(type)] <- .Machine$integer.max
    oo <- orderIntegerPairs(name, type)
    ans <- integer(length(oo))
    ans[oo] <- seq_along(oo)
    sm <- selfmatchIntegerPairs(name, type)
    ans[sm]
}

### 'chrom_ids' (integer vector, no NAs), 'strand' (character vector, factor,
### or anything supported by a "strand" method, no NAs), 'start' (integer
### vector, no NAs), and 'end' (integer vector, no NAs) must have the same
### length N (number of features).
### If 'name' is not NULL, it must be character vector or factor of length N,
### possibly with NAs.
### Returns an integer vector of length N containing one id per feature.
makeFeatureIds <- function(chrom_ids, strand, start, end,
                           name=NULL, type=NULL,
                           same.id.for.dups=FALSE)
{
    if (is.factor(strand)) {
        ## If levels contain "+", "-" and/or "*" in the wrong order then
        ## we coerce back to character.
        m <- match(levels(strand), levels(strand()))
        m <- m[!is.na(m)]
        if (!all(diff(m) >= 1L))
            strand <- as.character(strand)
    }
    if (!is.factor(strand))
        strand <- strand(strand)
    a <- chrom_ids
    b <- as.integer(strand)
    c <- start
    d <- end
    if (!(is.null(name) && is.null(type))) {
        a <- 3L * a + b
        b <- c
        c <- d
        d <- .rank_name_type(name, type)
    }
    if (!same.id.for.dups) {
        oo <- orderIntegerQuads(a, b, c, d)
        ans <- integer(length(oo))
        ans[oo] <- seq_len(length(oo))
        return(ans)
    }
    ## There should be a better way to do this...
    is_not_dup <- !duplicatedIntegerQuads(a, b, c ,d)
    ua <- a[is_not_dup]
    ub <- b[is_not_dup]
    uc <- c[is_not_dup]
    ud <- d[is_not_dup]
    oo <- orderIntegerQuads(ua, ub, uc, ud)
    ua <- ua[oo]
    ub <- ub[oo]
    uc <- uc[oo]
    ud <- ud[oo]
    matchIntegerQuads(a, b, c, d, ua, ub, uc, ud)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Miscellaneous.
###

### 'exon_count' must be a vector of positive integers and 'tx_strand' a
### character vector with "+" or "-" values. Both vectors must have the
### same length.
make_exon_rank_col <- function(exon_count, tx_strand)
{
    ans <- lapply(seq_len(length(exon_count)),
        function(i)
        {
            if (tx_strand[i] == "+")
                seq_len(exon_count[i])
            else
                (exon_count[i]):1L
        }
    )
    unlist(ans)
}