R/utils.R
In GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

### =========================================================================
### Miscellaneous low-level utils
### -------------------------------------------------------------------------
###
### Nothing in this file is exported.
###


is_single_value <- function(x)
{
    is.vector(x) && is.atomic(x) && length(x) == 1L
}

is_primary_key <- function(x)
{
    !anyNA(x) && all(nzchar(x)) && !anyDuplicated(x)
}

stop_if_not_primary_key <- function(x, x_what="'x'")
{
    if (!is_primary_key(x))
        stop(wmsg(x_what, " must not contain NAs, ",
                  "empty strings, or duplicates"))
}

### 'x' must be a matrix-like or data-frame-like object.
drop_cols <- function(x, columns=character(0))
{
    stopifnot(is.character(columns))
    if (length(columns) == 0L)
        return(x)
    drop_idx <- match(columns, colnames(x))
    stopifnot(!anyNA(drop_idx))
    x[ , -drop_idx, drop=FALSE]
}

### 'x' must be a matrix-like or data-frame-like object.
rename_cols <- function(x, old=character(0), new=character(0))
{
    stopifnot(is.character(old),
              is.character(new),
              length(old) == length(new))
    if (length(old) == 0L)
        return(x)
    rename_idx <- match(old, colnames(x))
    stopifnot(!anyNA(rename_idx))
    colnames(x)[rename_idx] <- new
    x
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### solid_match() and solid_match2()
###

.one_to_many_msg <- function(culprits, from_what, to_what)
{
    culprits_names <- names(culprits)
    if (!is.null(culprits_names))
        culprits <- culprits_names
    in1string <- paste0(unique(culprits), collapse=", ")
    c(from_what, "(s) matched to more than 1 ", to_what, ": ", in1string)
}

### Like base::match() but:
###   1. NA is never matched (incomparable).
###   2. Raises an error if some elements in 'x' can be matched to more
###      than one element in 'table' (one-to-many mapping).
### IMPORTANT NOTE: The fast implementation below doesn't work if 'x'
### contains duplicates so we check this. This means that we cannot use
### it in join_dfs() below!
solid_match <- function(x, table,
                        x_what="'x' element", table_what="'table' element")
{
    ## We only support atomic vectors (this is unlike base::match()
    ## where 'x' and 'table' each can be a NULL or a list).
    stopifnot(is.vector(x), is.atomic(x),
              !anyDuplicated(x, incomparables=NA),
              is.vector(table), is.atomic(table))
    revm <- match(table, x, incomparables=NA)  # reverse match
    ambig_idx <- which(duplicated(revm, incomparables=NA_integer_))
    if (length(ambig_idx) != 0L) {
        uidx <- unique(revm[ambig_idx])
        stop(wmsg(.one_to_many_msg(x[uidx], x_what, table_what)))
    }
    ans <- rep.int(NA_integer_, length(x))
    ok <- !is.na(revm)
    ans[revm[ok]] <- which(ok)
    ans
}

### Like base::match() but:
###   1. NA is never matched (incomparable).
###   2. Raises an error if the forward or reverse mapping is one-to-many.
solid_match2 <- function(x, table,
                         x_what="'x' element", table_what="'table' element")
{
    ## We only support atomic vectors (this is unlike base::match()
    ## where 'x' and 'table' each can be a NULL or a list).
    stopifnot(is.vector(x), is.atomic(x),
              is.vector(table), is.atomic(table))
    hits <- findMatches(x, table, incomparables=NA)
    q_hits <- queryHits(hits)
    s_hits <- subjectHits(hits)
    q_ambig_idx <- which(duplicated(q_hits))
    if (length(q_ambig_idx) != 0L) {
        uidx <- unique(q_hits[q_ambig_idx])
        stop(wmsg(.one_to_many_msg(x[uidx], x_what, table_what)))
    }
    s_ambig_idx <- which(duplicated(s_hits))
    if (length(s_ambig_idx) != 0L) {
        uidx <- unique(s_hits[s_ambig_idx])
        stop(wmsg(.one_to_many_msg(table[uidx], table_what, x_what)))
    }
    ans <- rep.int(NA_integer_, length(x))
    ans[q_hits] <- s_hits
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### JOIN two data.frames
###
### In typical base R fashion, merge.data.frame() does all kinds of weird
### things with the column names and is very inefficient. Plus, independently
### of how 'sort' is set, the order of the rows in the result does not make
### sense. The order of the columns doesn't either.
###
### join_dfs() below does NOT mess around with the column names, preserves
### the order of the rows of the left data.frame, does not shuffle the
### columns around (left and right columns stay on the left and right,
### respectively, and in their original order), and is about 3x-4x faster
### than merge.data.frame()!

.do_join <- function(Ldf, Rdf, L2R)
{
    #stopifnot(is.integer(L2R),
    #          length(L2R) == nrow(Ldf),
    #          all(L2R >= 1L, na.rm=TRUE),
    #          all(L2R <= nrow(Rdf), na.rm=TRUE))
    cbind(Ldf, S4Vectors:::extract_data_frame_rows(Rdf, L2R))
}

### Performs an SQL INNER JOIN by default. Also by default the right column
### of the join ('Rcolumn') is not included in the result.
### IMPORTANT NOTE: It will mimic the behavior of an SQL JOIN (INNER or LEFT)
### **only** if the right column contains unique values! (UNIQUE constraint
### in SQL). If not, then values in the left column ('Lcolumn') will only
### get mapped to their first match in the right column ('Rcolumn').
### Comparison with merge.data.frame():
### - INNER JOIN (join_dfs is about 3x faster):
###     join_dfs(df1, df2, Lcolumn, Rcolumn)
###     merge(df1, df2, by.x=Lcolumn, by.y=Rcolumn, sort=FALSE)
### - LEFT JOIN (join_dfs is about 4x faster):
###     join_dfs(df1, df2, Lcolumn, Rcolumn, left.join=TRUE)
###     merge(df1, df2, by.x=Lcolumn, by.y=Rcolumn, sort=FALSE, all.x=TRUE)
join_dfs <- function(Ldf, Rdf, Lcolumn, Rcolumn,
                     left.join=FALSE, keep.Rcol=FALSE)
{
    stopifnot(is.data.frame(Ldf),
              is.data.frame(Rdf),
              isSingleString(Lcolumn),
              isSingleString(Rcolumn))
    ## Values in the left column only get mapped to their first match in
    ## the right column.
    L2R <- match(Ldf[ , Lcolumn], Rdf[ , Rcolumn])
    if (!left.join) {
        ## Drop rows in 'Ldf' that are not mapped.
        drop_idx <- which(is.na(L2R))
        if (length(drop_idx) != 0L) {
            Ldf <- S4Vectors:::extract_data_frame_rows(Ldf, -drop_idx)
            L2R <- L2R[-drop_idx]
        }
    }
    if (!keep.Rcol)
        Rdf <- drop_cols(Rdf, Rcolumn)
    .do_join(Ldf, Rdf, L2R)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Other stuff
###

### Uses RCurl to access and list the content of an FTP dir.
list_ftp_dir <- function(url, subdirs.only=FALSE)
{
    doc <- getURL(url)
    listing <- strsplit(doc, "\n", fixed=TRUE)[[1L]]
    if (subdirs.only)
        listing <- listing[substr(listing, 1L, 1L) == "d"]
    ## Keep field no. 8 only
    pattern <- paste(c("^", rep.int("[^[:space:]]+[[:space:]]+", 8L)),
                     collapse="")
    listing <- sub(pattern, "", listing)
    sub("[[:space:]].*$", "", listing)
}

### Provides a simpler interface to read.table().
.simple_read_table <- function(file,
                               header=FALSE, colnames=NULL, col2class=NULL,
                               nrows=-1L, skip=0L,
                               sep="\t", quote="", comment.char="")
{
    if (is.null(col2class))
        col2class <- NA
    ## Prepare args to pass to read.table().
    args <- list(file, header=header, sep=sep, quote=quote, row.names=NULL,
                 na.strings=c("NA", "na"), colClasses=col2class, nrows=nrows,
                 skip=skip, comment.char=comment.char, stringsAsFactors=FALSE)
    if (!is.null(colnames))
        args$col.names <- colnames
    do.call(read.table, args)
}

### Calling read.table() directly on an URL tends to be unreliable. For
### example getChromInfoFromNCBI("CIEA01") was randomly failing for me with
### a "line 18564 did not have 10 elements" error (exact line number would
### vary) when getChromInfoFromNCBI() was calling read.table() on the URL
### of the NCBI assembly report. In my experience, the 2-step approach
### "first download the file, then call read.table() on the local file"
### seems to be a lot more reliable! This is what fetch_table_from_url()
### does. (And getChromInfoFromNCBI() now uses fetch_table_from_url().)
### Same interface as .simple_read_table() above.
fetch_table_from_url <- function(url, ...)
{
    destfile <- tempfile()
    suppressWarnings(download.file(url, destfile, quiet=TRUE))
    ans <- .simple_read_table(destfile, ...)
    unlink(destfile)
    ans
}

### Global character vector to hold default names for circular sequences.
### This is exported!
DEFAULT_CIRC_SEQS <- c(
    ## Mitochondrial genome
    "chrM", "MT", "MtDNA", "mit", "Mito", "mitochondrion",
    "dmel_mitochondrion_genome",
    ## Chloroplast genome
    "Pltd", "ChrC", "Pt", "chloroplast", "Chloro",
    ## Plasmid (yeast)
    "2micron", "2-micron", "2uM"
)

### AFAIK UCSC doesn't flag circular sequences.
### As of Sep 21, 2010 (Ensembl release 59), Ensembl was still not flagging
### circular sequences in their db (see this thread for the details
### http://lists.ensembl.org/pipermail/dev/2010-September/000139.html),
### NOT exported but used in the GenomicFeatures package.
make_circ_flags_from_circ_seqs <- function(seqlevels, circ_seqs=NULL)
{
    if (!is.character(seqlevels))
        stop(wmsg("'seqlevels' must be a character vector"))
    if (is.null(circ_seqs)) {
        ## The user did NOT specify the 'circ_seqs' argument.
        seqlevels <- tolower(seqlevels)
        circ_seqs <- tolower(DEFAULT_CIRC_SEQS)
        circ_flags <- rep.int(NA, length(seqlevels))
        circ_flags[seqlevels %in% circ_seqs] <- TRUE
    } else {
        ## The user specified the 'circ_seqs' argument.
        if (!is.character(circ_seqs) ||
            any(circ_seqs %in% c(NA_character_, "")))
            stop(wmsg("'circ_seqs' must be a character vector with no NAs ",
                      "and no empty strings"))
        bad_circ_seqs <- setdiff(circ_seqs, seqlevels)
        if (length(bad_circ_seqs) != 0L) {
            in1string <- paste0(bad_circ_seqs, collapse=", ")
            stop(wmsg("'circ_seqs' contains unrecognized chromosome names: ",
                      in1string))
        }
        circ_flags <- seqlevels %in% circ_seqs
    }
    circ_flags
}

### Note that, strictly speaking, mergeNamedAtomicVectors() is not
### commutative, i.e., in general 'z1 <- mergeNamedAtomicVectors(x, y)' is
### not identical to 'z2 <- mergeNamedAtomicVectors(y, x)'. However 'z1' and
### 'z2' are both guaranteed to have unique names and to contain the same set
### of name/value pairs (but typically not in the same order).
mergeNamedAtomicVectors <- function(x, y, what=c("key", "values"))
{
    if (!is.atomic(x) || !is.atomic(y) || typeof(x) != typeof(y))
        stop("'x' and 'y' must be atomic vectors of the same type")
    x_names <- names(x)
    y_names <- names(y)
    if (is.null(x_names) || is.null(y_names))
        stop("'x' and 'y' must have names")
    ans_names <- union(x_names, y_names)
    if (any(ans_names %in% c(NA_character_, "")))
        stop("some names in 'x' or 'y' are NA or the empty string")
    ## Note sure why but subsetting by name is *very* slow when the character
    ## vector used as subscript contains a lot of "invalid" names. I already
    ## reported this issue twice on the R-devel mailing list (in July 2010 and
    ## May 2013) with no answer so far.
    #ans <- x[ans_names]  # very slow :-(
    ans <- x[match(ans_names, x_names)]  # much faster :-)
    ## Some of 'ans' names can be NA. This is because subsetting a named
    ## vector 'x' by a character vector 'i' returns a vector whose names
    ## are 'i' except for the values in 'i' that are not in 'names(x)'.
    ## So we need to fix this.
    names(ans) <- ans_names
    #ans2 <- y[ans_names]  # very slow :-(
    ans2 <- y[match(ans_names, y_names)]  # much faster :-)
    idx <- which(ans != ans2)
    if (length(idx) != 0L) {
        msg <- c(what[1L], ifelse(length(idx) >= 2, "s", ""), " ",
                 paste(ans_names[idx], collapse=", "), " ",
                 ifelse(length(idx) >= 2, "have", "has"),
                 " incompatible ", what[2L], ":\n  - in 'x': ",
                 paste(ans[idx], collapse=", "), "\n  - in 'y': ",
                 paste(ans2[idx], collapse=", "))
        stop(msg)
    }
    idx <- is.na(ans) & !is.na(ans2)
    ans[idx] <- ans2[idx]
    ans
}

Any scripts or data that you put into this service are public.

GenomeInfoDb documentation built on April 9, 2021, 6 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

GenomeInfoDb
Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

R/utils.R
In GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

Defines functions mergeNamedAtomicVectors make_circ_flags_from_circ_seqs fetch_table_from_url .simple_read_table list_ftp_dir join_dfs .do_join solid_match2 solid_match .one_to_many_msg rename_cols drop_cols stop_if_not_primary_key is_primary_key is_single_value

Try the GenomeInfoDb package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GenomeInfoDb Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

R/utils.R In GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

Defines functions mergeNamedAtomicVectors make_circ_flags_from_circ_seqs fetch_table_from_url .simple_read_table list_ftp_dir join_dfs .do_join solid_match2 solid_match .one_to_many_msg rename_cols drop_cols stop_if_not_primary_key is_primary_key is_single_value

Try the GenomeInfoDb package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GenomeInfoDb
Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

R/utils.R
In GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style