R/GeneSetDb-methods.R

Defines functions .gsd.row.index as.list.GeneSetDb csplit as.data.frame.GeneSetDb as.data.table.GeneSetDb all.equal.GeneSetDb addGeneSetMetadata addCollectionMetadata hasGeneSet hasGeneSetCollection subset.GeneSetDb is.active incidenceMatrix is.conformed annotateGeneSetMembership

Documented in addCollectionMetadata addGeneSetMetadata all.equal.GeneSetDb annotateGeneSetMembership as.data.frame.GeneSetDb as.data.table.GeneSetDb hasGeneSet hasGeneSetCollection incidenceMatrix is.active is.conformed subset.GeneSetDb

#' Annotates rows of a data.frame with geneset membership from a GeneSetDb
#'
#' This is helpful when you don't have a monsterly sized GeneSetDb. There will
#' be as many new columns added to `x` as there are active genesets in `gdb`.
#'
#' @export
#'
#' @param x A data.frame with genes/features in rows
#' @param gdb A [GeneSetDb()] object with geneset membership
#' @param x.ids The name of the column in `x` that holds the feautre
#'   id's in `x` that match the feature_id's in `gdb`, or a vector
#'   of id's to use for each row in `x` that represent these.
#' @param ... parameters passed down into [incidenceMatrix()]
#' @return Returns the original `x` with additional columns: each is a
#'   logical vector that indicates membership for genesets defined in
#'   `gdb`.
#'
#' @examples
#' vm <- exampleExpressionSet()
#' gdb <- exampleGeneSetDb()
#' mg <- seas(vm, gdb, design = vm$design, contrast = 'tumor')
#' lfc <- logFC(mg)
#' annotated <- annotateGeneSetMembership(lfc, gdb, 'feature_id')
#'
#' ## Show only genes that are part of 'HALLMARK_ANGIOGENESIS' geneset
#' angio <- subset(annotated, `c2;;BIOCARTA_AGPCR_PATHWAY`)
annotateGeneSetMembership <- function(x, gdb, x.ids=NULL, ...) {
  stopifnot(is(x, 'data.frame'))
  stopifnot(is(gdb, 'GeneSetDb'))

  if (is.null(x.ids)) {
    ## Guess the column of x that has featureIds by identifying the column of
    ## x that has the highest percent membership in gdb@featureIdMap
    membership <- sapply(x, function(col) {
      if (!is.character(col)) 0 else mean(col %in% featureIdMap(gdb)$x.id)
    })
    x.ids <- names(x)[which.max(membership)]
  }
  if (!is.character(x.ids)) {
    stop("character expected for x.ids")
  }
  if (length(x.ids) == 1L && is.character(x[[x.ids]])) {
    ## This is a column in df that has the gene IDs that `x` expects
    x.ids <- x[[x.ids]]
  }
  if (nrow(x) != length(x.ids)) {
    stop("x.ids must be a vector of gene ids, or a column name in x of these")
  }

  im <- incidenceMatrix(gdb, x.ids, ...)
  storage.mode(im) <- 'logical'
  cbind(x, t(im))
}

#' @describeIn geneSets Returns the number of genesets in a GeneSetDb
setMethod("length", "GeneSetDb", function(x) nrow(geneSets(x)))

#' (Re)-map geneset IDs to the rows in an expression object.
#'
#' @description `conform`-ing, a `GeneSetDb` to a target expression
#' object is an important step required prior to perform any type of GSEA. This
#' function maps the featureIds used in the GeneSetDb to the elements of a
#' target expression object (ie. the rows of an expression matrix, or the
#' elements of a vector of gene-level statistics).
#'
#' After `conform`-ation, each geneset in the `GeneSetDb` is flagged
#' as active (or inactive) given the number of its features that are
#' successfully mapped to `target` and the minimum and maximum number of
#' genes per geneset required as specified by the `min.gs.size` and
#' `max.gs.size` parameters, respectively.
#'
#' Only genesets that are marked with `active = TRUE` will be used in any
#' downstream gene set operations.
#'
#' @section Related Functions:
#'
#' * [unconform()]: Resets the conformation mapping.
#' * [is.conformed()]: If `to` is missing, looks for evidence that `conform` has
#'   been called (at all) on `x`. If `to` is provided, specifically checks that
#'   `x` has been conformed to the target object `to`.
#'
#' @rdname conform
#'
#' @param x The GeneSetDb
#' @param target The expression object/matrix to conform to. This could also
#'   just be a character vector of IDs.
#' @param unique.by If there are multiple rows that map to the identifiers used
#'   in the genesets, this is a means to pick the single row for that ID
#' @param min.gs.size Ensure that the genesets that make their way to the
#'   `GeneSetDb@@table` are of a minimum size
#' @param max.gs.size Ensure that the genesets that make their way to the
#'   `GeneSetDb@@table` are smaller than this size
#' @param match.tolerance Numeric value between \[0,1\]. If the fraction of
#'   `feature_id`s used in `x` that match `rownames(y)` is below
#'   this number, a warning will be fired.
#' @param ... moar args
#'
#' @return A  [GeneSetDb()] that has been matched/conformed to an expression
#'   object target `y`.
#'
#' @examples
#' es <- exampleExpressionSet()
#' gdb <- exampleGeneSetDb()
#' head(geneSets(gdb))
#' gdb <- conform(gdb, es)
#' ## Note the updated values `active` flag, and n (the number of features
#' ## mapped per gene set)
#' head(geneSets(gdb))
setMethod("conform", c(x="GeneSetDb"),
function(x, target, unique.by=c('none', 'mean', 'var'),
         min.gs.size=2L, max.gs.size=Inf, match.tolerance=0.25, ...) {
  unique.by <- match.arg(unique.by)
  if (unique.by != 'none') {
    stop("`unique.by` must be 'none' for now")
  }
  # if (min.gs.size == 1) {
  #   stop("Doing GSEA with 1 gene doesn't really make sense, does it?")
  # }
  if (max.gs.size < min.gs.size) {
    stop("max.gs.size must be larger than min.gs.size")
  }
  ## We are allowing y to be a character vector of featureIds here
  if (is.vector(target) && is.character(target)) {
    target <- matrix(1L, nrow=length(target), ncol=1L,
                     dimnames=list(target, NULL))
  }
  if (!any(sapply(.valid.x, function(claz) is(target, claz)))) {
    stop("Illegal type of expression object to conform to")
  }
  # x <- copy(x)
  fm <- featureIdMap(x, as.dt=TRUE)
  fm$x.idx <- match(fm$x.id, rownames(target))
  fraction.match <- mean(!is.na(fm$x.idx))
  if (fraction.match <= match.tolerance) {
    warning("Fraction of gene set IDs that match rownames in the expression ",
            "object are low: ", sprintf("%.2f%% ", fraction.match * 100),
            immediate.=TRUE)
  }
  if (fraction.match == 0) {
    stop("None of the rownames of the expression object match the featureIds ",
         "for the genesets")
  }

  otable <- x@table
  ntable <- x@db[, {
    f <- feature_id
    xref <- fm[list(f)]
    n <- sum(!is.na(xref$x.idx))
    active <- n >= min.gs.size && n <= max.gs.size
    list(active=active, N=.N, n=n)
  }, by=c('collection', 'name')]

  ## Add back columns from original x@table that you just nuked
  kosher <- all.equal(
    otable[, list(collection, name)],
    ntable[, list(collection, name)])
  stopifnot(kosher)
  add.me <- setdiff(colnames(otable), colnames(ntable))
  for (col in add.me) ntable[, (col) := otable[[col]]]
  x@table <- ntable

  inactive <- !x@table$active
  if (any(inactive)) {
    msg <- paste("Deactivating %d gene sets because conformation of GeneSetDb",
                 "to the target creates gene sets smaller than %s or greater",
                 "than %s\n")
    msg <- sprintf(msg, sum(inactive), as.character(min.gs.size),
                   as.character(max.gs.size))
    warning(msg, immediate.=TRUE)
  }

  x@featureIdMap <- fm
  x
})

#' @export
#' @rdname conform
setMethod("unconform", "GeneSetDb", function(x, ...) {
  x@table <- transform(x@table, active=FALSE)
  x@featureIdMap <- transform(x@featureIdMap, x.idx=NA_integer_)
  x
})

#' @export
#' @describeIn conform Checks to see if GeneSetDb `x` is conformed to a target
#'   object `to`
#' @param to the object to test conformation to
is.conformed <- function(x, to) {
  if (!is(x, 'GeneSetDb')) {
    stop("Only works on the GeneSetDb")
  }
  if (missing(to)) {
    ans <- any(!is.na(featureIdMap(x, as.dt=TRUE)$x.idx))
  } else {
    ## Verify that gsd is properly conformed to x
    fm <- subset(featureIdMap(x, as.dt=TRUE), !is.na(x.idx))
    to.ids <- if (is.character(to))
      to
    else if (is.vector(to))
      names(to)
    else
      rownames(to)
    if (!is.character(to.ids)) {
      stop("featureIds unsuccessfully extracted from `to`")
    }
    ans <- nrow(fm) > 0 && all(to.ids[fm$x.idx] == fm$x.id)
  }
  ans
}

#' Creates a 1/0 matrix to indicate geneset membership to target object.
#'
#' Generates an inidcator matrix to indicate membership of genes (columns)
#' to gene sets (rows). If `y` is provided, then the incidence is mapped
#' across the entire feature-space of `y`.
#'
#' @export
#'
#' @param x A [GeneSetDb()]
#' @param y (optional) A target (expression) object `x` is (or can be)
#'   conformed to
#' @param ... parameters passed down into [conform()].
#' @return incidence matrix with nrows = number of genesets and columns are
#'   featureIDs. If `y` is passed in, the columns of the returned value
#'   match the rows of `y`.
#'
#' @examples
#' vm <- exampleExpressionSet()
#' gdb <- exampleGeneSetDb()
#' im <- incidenceMatrix(gdb)
#' imv <- incidenceMatrix(gdb, vm)
incidenceMatrix <- function(x, y, ...) {
  stopifnot(is(x, 'GeneSetDb'))
  gs <- NULL
  if (missing(y)) {
    val <- 'feature_id'
    ynames <- unique(x@db$feature_id)
    ncol <- length(ynames)
  } else {
    val <- 'x.idx'
    x <- conform(x, y, ...)
    if (is.vector(y)) {
      ynames <- y
      ncol <- length(y)
    } else {
      ynames <- rownames(y)
      ncol <- nrow(y)
    }
  }

  gs <- geneSets(x, as.dt=TRUE)
  dimnames <- list(encode_gskey(gs), ynames)
  out <- matrix(0L, nrow(gs), ncol, dimnames=dimnames)

  for (i in seq_len(nrow(gs))) {
    fids <- featureIds(x, gs$collection[i], gs$name[i], value=val)
    out[i, fids] <- 1L
  }

  out
}


#' Interrogate "active" status of a given geneset.
#'
#' Returns the `active` status of genesets, which are specified by
#' their collection,name compound keys. This function is vectorized and
#' supports query of multiple gene sets at a time. If a requested
#' collection,name gene set doesn't exist, this throws an error.
#'
#' @export
#' @param x [GeneSetDb()]
#' @param i collection of geneset(s)
#' @param j name of geneset(s) (must be same length as `i`.
#' @return logical indicating if geneset is active. throws an error if
#'   any requested geneset does not exist in `x`.
#' @examples
#' dge.stats <- exampleDgeResult()
#' y <- exampleExpressionSet(do.voom = FALSE)
#' gdb <- conform(exampleGeneSetDb(), y, min.gs.size = 10)
#' # size 9 geneset:
#' geneSet(gdb, "c2", "BYSTRYKH_HEMATOPOIESIS_STEM_CELL_IL3RA")
#' is.active(gdb, "c2", "BYSTRYKH_HEMATOPOIESIS_STEM_CELL_IL3RA")
#' # geneset with >100 genes
#' is.active(gdb, "c7", "GSE3982_MAC_VS_NEUTROPHIL_LPS_STIM_DN")
is.active <- function(x, i, j) {
  stopifnot(is(x, 'GeneSetDb'))
  stopifnot(is.character(i), is.character(j), length(i) == length(j))
  gsx <- geneSets(x, active.only=FALSE, as.dt=TRUE)[list(i,j), nomatch=NA]
  res <- gsx$active
  isna <- is.na(res)
  if (any(isna)) {
    unk <- paste(i[isna], j[isna], sep=":", collapse=",")
    stop(sprintf("Unknown genesets: %s", unk))
  }
  res
}

#' @describeIn subsetByFeatures subset GeneSetDb by feature id's
setMethod("subsetByFeatures", c(x="GeneSetDb"),
function(x, features, value=c('feature_id', 'x.id', 'x.idx'), ...) {
  value <- match.arg(value)
  ## some good old data.table voodoo going on inside here
  unk.f <- setdiff(features, featureIds(x, value=value))
  if (length(unk.f)) {
    warning(length(unk.f), "/", length(features), " do not exist in GeneSetDb")
    features <- setdiff(features, unk.f)
  }

  dat <- merge(x@db, featureIdMap(x, as.dt=TRUE), by='feature_id')
  hits <- unique(dat[dat[[value]] %in% features, list(collection, name)])
  gs.all <- geneSets(x, active.only=FALSE, as.dt=TRUE)
  keep <- rep(FALSE, nrow(gs.all))
  gs.idx <- gs.all[hits, which=TRUE]
  if (length(gs.idx)) {
    keep[gs.idx] <- TRUE
  }
  x[keep]
})

#' @rdname featureIds
setMethod("featureIds", c(x="GeneSetDb"),
function(x, i, j, value=c('feature_id', 'x.id', 'x.idx'),
         active.only=is.conformed(x), ...) {
  if (missing(value)) {
    value <- if (is.conformed(x)) 'x.id' else 'feature_id'
  }
  value <- match.arg(value, c('feature_id', 'x.id', 'x.idx'))

  if (missing(i) && missing(j)) {
    ## User isn't asking about any particular collection, but just wants all
    ## features in the GeneSetDb as a whole ... OK(?)
    fmap <- x@featureIdMap
    if (is.conformed(x) && active.only) {
      fmap <- fmap[!is.na(x.idx)]
    }
    out <- unique(fmap, by=value)[[value]]
    return(out[!is.na(out)])
  }

  if (!isSingleCharacter(i)) {
    stop("collection (i) must be length 1 character vectors")
  }
  if (missing(j)) {
    whole.collection <- TRUE
  } else {
    if (!isSingleCharacter(j)) {
      stop("gene set name (j) must be length 1 character vectors")
    }
    whole.collection <- FALSE
  }

  gs <- geneSets(x, active.only=active.only, as.dt=TRUE)
  gs <- gs[, key(gs), with=FALSE]
  gs <- gs[list(i)]

  if (nrow(gs) == 0L) {
    stop("There are no ", if (active.only) "active " else NULL,
         "genesets in collection: ", i)
  }

  if (whole.collection) {
    db <- unique(x@db[gs], by='feature_id')
  } else {
    ## I am purposefully not using `hasGeneSet` here for performance reasons
    ## hasGeneSet(x, i, j, as.error=TRUE)
    db <- x@db[list(i, j)]
    if (is.na(db$feature_id[1L])) {
      msg <- sprintf("collection=%s, name=%s does not exist in GeneSetDb db",
                     i, j)
      stop(msg)
    }
  }

  fid.map <- featureIdMap(x, as.dt=TRUE)[db$feature_id]
  if (is.conformed(x) && active.only) {
    fid.map <- fid.map[!is.na(x.idx)]
  }

  fid.map[[value]]
})

#' @describeIn featureIdMap extract featureIdMap from a GeneSetDb
#' @template asdt-param
setMethod("featureIdMap", c(x="GeneSetDb"), function(x, as.dt = FALSE) {
  out <- x@featureIdMap
  if (!as.dt) out <- setDF(copy(out))
  out
})

#' Replacing the featureIdMap blows away the "conform"-ation status of x
#'
#' This method ensures that there is only one feature_id <-> x.id mapping value.
#' Note that this does not mean that this enforces a 1:1 mapping, it's just
#' that the same mapping is not listed more than once.
#'
#' @noRd
setReplaceMethod('featureIdMap', 'GeneSetDb', function(x, value) {
  if (!is.data.frame(value)) {
    stop("data.frame/table required for featureIdMap<-")
  }
  if (!ncol(value) == 2) {
    stop("featureIdMap must be a 2 column data.frame")
  }
  if (!all(x@db$feature_id %in% value[[1]])) {
    stop("Some @db$feature_id's are not in first column of new featureIdMap")
  }

  value <- as.data.table(value)
  setnames(value, c('feature_id', 'x.id'))
  value <- unique(value, by=c('feature_id', 'x.id'))
  setkeyv(value, 'feature_id')
  value[, x.idx := NA_integer_]
  x@featureIdMap <- value
  unconform(x)
})

#' @describeIn geneSets return all genesets from a GeneSetDb
setMethod("geneSets", c(x="GeneSetDb"),
function(x, active.only=is.conformed(x), ... , as.dt=FALSE) {
  out <- if (active.only[1L]) x@table[active == TRUE] else x@table
  if (!as.dt) out <- setDF(copy(out))
  out
})

#' @rdname geneSet
#' @param collection using `i` as the parameter for "collection" isn't intuitive
#'   so if speficially set this paramter, it will replace the value for `i`.
#' @param name the same for the `collection`:`i` parameter relationship, but for
#'   `j`:`name`.
#' @examples
#' gdb <- exampleGeneSetDb()
#' geneSet(gdb, "c2", "KOMMAGANI_TP63_GAMMA_TARGETS")
#' geneSet(gdb, collection = "c2", name = "KOMMAGANI_TP63_GAMMA_TARGETS")
#' geneSet(gdb, name = "KOMMAGANI_TP63_GAMMA_TARGETS")
setMethod("geneSet", c(x="GeneSetDb"),
function(x, i, j, active.only=is.conformed(x), with.feature.map=FALSE, ...,
         collection = NULL, name = NULL, as.dt = FALSE) {
  if (!is.null(collection) && missing(i)) i <- collection
  if (!is.null(name) && missing(j)) j <- name
  if (missing(i) && is.null(collection)) {
    stopifnot(isSingleCharacter(j))
    gs <- geneSets(x, as.dt = TRUE)[name == j]
    if (nrow(gs) != 1L) {
      stop("Cannot resolve geneset using only `name` == `'", j, "'`")
    }
    i <- gs[["collection"]]
  } else if (missing(j) && is.null(name)) {
    stopifnot(isSingleCharacter(i))
    gs <- geneSets(x, as.dt = TRUE)[collection == i]
    if (nrow(gs) != 1L) {
      stop("Cannot resolve geneset using only `collection` == `'", i, "'`")
    }
    j <- gs[["name"]]
  }
  stopifnot(isSingleCharacter(i), isSingleCharacter(j))
  fids <- featureIds(x, i, j, value='feature_id', active.only=active.only, ...)
  info <- geneSets(x, active.only=FALSE, as.dt=TRUE)[list(i, j)]
  info <- info[, c("collection", "name", "active", "N", "n"), with=FALSE]

  ## Fetch information from x@db. Extra information per feature are stored here
  dbx <- x@db[list(i,j,fids)]
  out <- cbind(info[rep(1L, nrow(dbx))], dbx[, -(1:2), with=FALSE])

  ## Add featureIdMap info
  if (with.feature.map) {
    fminfo <- featureIdMap(x, as.dt=TRUE)[list(fids)]
    out <- cbind(out, fminfo[, -1L, with=FALSE])
  }

  if (!as.dt) out <- setDF(copy(out))
  out
})

#' Subset GeneSetDb to only include specified genesets.
#'
#' This is a utility function that is called by \code{[.GeneSetDb} and is not
#' exported because it is not meant for external use.
#'
#' DEBUG: If `keep` is all FALSE, this will explode. What does an empty
#' GeneSetDb look like, anyway? Something ...
#'
#' We want to support a better, more fluent subsetting of GeneSetDb objects.
#' See Issue #12 (https://github.com/lianos/multiGSEA/issues/12)
#'
#' @param x a [GeneSetDb()]
#' @param keep logical vector as long as `nrow(geneSets(x, active.only=FALSE)`
#' @return a `GeneSetDb` that has only the results for the specified
#'   genesets.
#' @examples
#' gdb.all <- exampleGeneSetDb()
#' gs <- geneSets(gdb.all)
#' gdb <- gdb.all[gs$collection %in% c("c2", "c7")]
subset.GeneSetDb <- function(x, keep) {
  stopifnot(is(x, 'GeneSetDb'))
  if (all(keep == FALSE)) {
    stop("Cannot subset GeneSetDb down to empty (`keep` is all FALSE)")
  }
  nr <- nrow(geneSets(x, active.only=FALSE, as.dt=TRUE))

  if (!is.logical(keep) && length(keep) != nr) {
    stop("The `keep` vector is FUBAR'd")
  }

  ## 1. Remove rows from x@table
  ## 2. Remove rows in x@db that belong to collection,name that do not exist
  ##    due to (1)
  ## 3. remove entries in x@featureIdMap for features that no longer exist in
  ##    updated db from (2)
  ## 4. Update x@collectionMetadata to:
  ##    a. remove all metadata for collections that are completely gone
  ##    b. update remaining collection,count entries for remaining collections

  ## 1
  keep.table <- x@table[keep]

  ## 2
  gs.keys <- keep.table[, key(keep.table), with=FALSE]
  setkeyv(gs.keys, key(keep.table))
  keep.db <- x@db[gs.keys, nomatch=0] ## only keep entries in db in gs.keys

  ## 3
  keep.featureIdMap <- subset(x@featureIdMap, feature_id %in% keep.db$feature_id)

  ## 4a
  keep.cm <- subset(x@collectionMetadata, collection %in% keep.db$collection)
  ## 4b
  # NOTE: remove count collectionMetadata
  # cc <- keep.table[, list(name='count', value=.N), by='collection']
  # setkeyv(cc, key(keep.cm))

  ## Currently (data.table v1.9.4( there's nothing I can do to make i.value a
  ## list element and this `set` mojo doesn't work either
  # value <- i.value <- NULL # silence R CMD check NOTEs
  # suppressWarnings(keep.cm[cc, value := list(i.value)])
  ## update.idxs <- keep.cm[cc, which=TRUE]
  ## val.idx <- which(colnames(keep.cm) == 'value')
  ## for (idx in seq_along(update.idxs)) {
  ##   set(keep.cm, update.idxs[idx], val.idx, list(cc$value[idx]))
  ## }

  out <- .GeneSetDb(table=keep.table,
                    db=keep.db,
                    featureIdMap=keep.featureIdMap,
                    collectionMetadata=keep.cm)
  out
}

#' Subset whole genesets from a GeneSetDb
#' @exportMethod [
#' @param x GeneSetDb
#' @param i a logical vector as long as `nrow(geneSets(x))` indicating which
#'   geneSets to keep
#' @param j ignored
#' @param ... pass through arguments
#' @return GeneSetDb `x` with a subset of the genesets it came in with.k
#' @param drop ignored
setMethod("[", "GeneSetDb", function(x, i, j, ..., drop = FALSE) {
  if (length(list(...)) > 0) {
    stop("parameters in '...' not supported")
  }
  if (!missing(drop)) {
    warning("drop argument ignored", immediate.=TRUE)
  }
  if (!missing(j)) {
    warning("j is ignored", immediate.=TRUE)
  }
  if (!is.logical(i) && length(i) != nrow(geneSets(x))) {
    stop("i must be a logical vector as long as nrow(geneSets(x))")
  }
  subset.GeneSetDb(x, i)
})


## -----------------------------------------------------------------------------
## Functions over collections

#' Check if a collection exists in the \code{GeneSetDb}
#'
#' @export
#' @param x A [GeneSetDb()]
#' @param collection character vector of name(s) of the collections to query
#' @param as.error logical if TRUE, this will error instead of returning FALSE
#' @return logical indicating if this collection exists
#' @examples
#' gdb <- exampleGeneSetDb()
#' hasGeneSetCollection(gdb, "c2")
#' hasGeneSetCollection(gdb, "unknown collection")
hasGeneSetCollection <- function(x, collection, as.error=FALSE) {
  stopifnot(is(x, 'GeneSetDb'))
  stopifnot(is.character(collection))
  meta.idxs <- match(collection,
                     collectionMetadata(x, as.dt=TRUE)$collection)
  gsc.exists <- !is.na(meta.idxs)
  if (!all(gsc.exists) && as.error) {
    bad <- paste("    * ", collection[!gsc.exists], collapse='\n', sep='')
    stop("The following collections to not exist:\n", bad)
  }

  gsc.exists
}

#' Check to see if the GeneSetDb has a collection,name GeneSet defined
#'
#' @export
#' @param x GeneSetDb
#' @param collection character indicating the collection
#' @param name character indicating the name of the geneset
#' @param as.error If `TRUE`, a test for the existance of the geneset will throw
#'   an error if the geneset does not exist
#' @return logical indicating whether or not the geneset is defined.
#'
#' @examples
#' gdb <- exampleGeneSetDb()
#' hasGeneSet(gdb, c('c2', 'c7'), c('BIOCARTA_AGPCR_PATHWAY', 'something'))
hasGeneSet <- function(x, collection, name, as.error=FALSE) {
  gs.exists <- !is.na(.gsd.row.index(x, collection, name))
  if (as.error && !all(gs.exists)) {
    msg <- "The follwing %d gene sets do not exist: \n%s"
    not <- paste(collection[!gs.exists], name[!gs.exists],
                 sep=":", collapse="\n")
    stop(sprintf(msg, sum(!gs.exists), not))
  }
  gs.exists
}

#' @describeIn collectionMetadata Returns metadata for all collections
setMethod("collectionMetadata",
  c(x="GeneSetDb", collection="missing", name="missing"),
  function(x, collection, name, as.dt=FALSE) {
    out <- x@collectionMetadata
    if (!as.dt) {
      warning("The collectionMetadata has a list column ('value'). This ",
              "may not play well with data.frame based operation. Consider ",
              "returning as a data.table by setting `as.dt = TRUE`",
              immediate. = TRUE)
      out <- setDF(copy(out))
    }
    out
  })

#' @describeIn collectionMetadata Returns all metadata for a specific collection
setMethod("collectionMetadata",
  c(x="GeneSetDb", collection="character", name="missing"),
  function(x, collection, name, as.dt=FALSE) {
    stopifnot(isSingleCharacter(collection))
    hasGeneSetCollection(x, collection, as.error=TRUE)
    out <- x@collectionMetadata[collection]
    if (!as.dt) {
      warning("The collectionMetadata has a list column ('value'). This ",
              "may not play well with data.frame based operation. Consider ",
              "returning as a data.table by setting `as.dt = TRUE`",
              immediate. = TRUE)
      out <- setDF(copy(out))
    }
    out
  })

#' @describeIn collectionMetadata Returns the `name` metadata value for a given
#'   `collection`.
setMethod("collectionMetadata",
  c(x="GeneSetDb", collection="character", name="character"),
  function(x, collection, name, as.dt=FALSE) {
    stopifnot(isSingleCharacter(collection))
    stopifnot(isSingleCharacter(name))
    hasGeneSetCollection(x, collection, as.error=TRUE)
    .col <- collection
    .name <- name
    cmd <- x@collectionMetadata
    idx <- cmd[list(.col, .name), which=TRUE]
    if (is.na(idx[1L])) {
      msg <- sprintf("metadata not defined for collection:%s, varname:%s",
                     .col, .name)
      stop(msg)
    }
    cmd$value[[idx]]
  })

#' @describeIn collectionMetadata returns the URL for a geneset
setMethod("geneSetURL", c(x = "GeneSetDb"), function(x, i, j, ...) {
  stopifnot(is.character(i), is.character(j), length(i) == length(j))
  collections <- unique(i)
  col.exists <- hasGeneSetCollection(x, collections)
  url.fns <- Map(collections, col.exists, f = function(col, exists) {
    if (exists) {
      geneSetCollectionURLfunction(x, col)
    } else {
      function(x, y, ...) NA_character_
    }
  })
  mapply(i, j, FUN = function(col, name, ...) {
    url.fns[[col]](col, name, gdb = x, ...)
  })
})

#' @describeIn geneSetCollectionURLfunction returns the gene set collection
#'   url function from a GeneSetDb
#' @param x The GeneSetDb
#' @param i The collection to get the url function from
#' @param ... pass through arguments (not used)
#' @return the function that maps collection,name combinations to an
#'   informational URL.
#' @examples
#' gdb <- exampleGeneSetDb()
#' geneSetCollectionURLfunction(gdb, "c2", "BIOCARTA_AGPCR_PATHWAY")
setMethod("geneSetCollectionURLfunction", "GeneSetDb", function(x, i, ...) {
  stopifnot(isSingleCharacter(i))
  fn.dt <- x@collectionMetadata[list(i, 'url_function'), nomatch = 0]
  if (nrow(fn.dt) == 0) {
    ## validObject(x) : this call is slow and should never be FALSE anyway
    stop(sprintf("No url_function for collection '%s' found", i))
  }
  if (nrow(fn.dt) > 1) {
    ## validObject(x) : this call is slow and should never be FALSE anyway
    stop(sprintf("Multiple url_function defined for collection '%s' found", i))
  }

  fn <- fn.dt$value[[1L]]
  if (test_string(fn)) {
    fn <- get_function(fn)
  }
  if (!is.function(fn)) {
    ## validObject(x) : this call is slow and should never be FALSE anyway
    stop(sprintf("The URL function for collection '%s' is missing", i))
  }

  fn
})

#' @describeIn geneSetCollectionURLfunction sets the gene set collection url
#'   function for a `GeneSetDb : Collection` combination.
#' @param value the function to set as the geneset url function for the given
#'   collection `i`. This can be an actual function object, or the (string)
#'   name of the function to pull out of "the ether"
#'   (`"pkgname::functionname"` can work, too). The latter is preferred as
#'   it results in smaller serialized GeneSetDb objects.
setReplaceMethod("geneSetCollectionURLfunction", "GeneSetDb",
function(x, i, value) {
  valid <- function(v) {
    v <- get_function(v)
    if (!isTRUE(is.function(v))) return(FALSE)
    args <- formalArgs(v)
    # We didn't specify previously that the  names of the arguments had to be
    # collection and name, so don't check those. Just require >= 3, and that
    # one of them is "...
    if (length(args) < 3 || !"..." %in% args) {
      warning("geneSetURL functions must take at least two named arguments, ",
              "and must also handle `...`")
      return(FALSE)
    }
    url.test <- tryCatch(v("a", "b", x), error = function(e) NULL)
    if (!test_string(url.test)) {
      warning("geneSetURL does not return a string when called")
      return(FALSE)
    }
    TRUE
  }
  # We explicitly strip the calling environment associated with the function
  # as this can result in huge objects. Sometimes saveRDS'ing small GeneSetDb
  # objects resulted in multi 100's of MB's worth of an *.rds file
  #
  # If the function's envirnment has a `.fn.local.vars` character vector, then
  # only those arguments are kept in the environment, otherwise we only keep
  # objects that are character vectors
  if (is.function(value)) {
    warning("Storing actual functions as a url_function inflates serialized ",
            "consider storing the name of the function instead")
    environment(value) <- new.env(parent = parent.env(environment(value)))
  }
  added <- addCollectionMetadata(x, i, 'url_function', value, valid)
  added
})

#' @describeIn collectionMetadata sets the feature id type for a collection
setReplaceMethod("featureIdType", "GeneSetDb", function(x, i, value) {
  valid <- function(v) is(v, 'GeneIdentifierType')
  addCollectionMetadata(x, i, 'id_type', value, valid)
})

#' @describeIn collectionMetadata retrieves the feature id type for a collection
setMethod("featureIdType", "GeneSetDb", function(x, i, ...) {
  x@collectionMetadata[list(i, 'id_type')]$value[[1L]]
})

#' @section Adding arbitrary collectionMetadata:
#'
#' Adds arbitrary metadata to a gene set collection of a GeneSetDb
#'
#' Note that this is not a replacement method! You must catch the returned
#' object to keep the one with the updated `collectionMetadata`. Although this
#' function is exported, I imagine this being used mostly through predefined
#' replace methods that use this as a utility function, such as the replacement
#' methods `featureIdType<-`, `geneSetURLfunction<-`, etc.
#'
#' ```
#' gdb <- getMSigGeneSetDb('H')
#' gdb <- addCollectionMetadata(gdb, 'H', 'foo', 'bar')
#' ```
#'
#' @export
#' @rdname collectionMetadata
#'
#' @param x [GeneSetDb()]
#' @param xcoll The collection name
#' @param xname The name of the metadata variable
#' @param value The value of the metadata variable
#' @param validate.value.fn If a function is provided, it is run on
#'   `value` and msut return `TRUE` for addition to be made
#' @param allow.add If `FALSE`, this xcoll,xname should be in the
#'   `GeneSetDb` already, and this will fail because something is
#'   deeply wrong with the world
#' @return The updated `GeneSetDb`.
addCollectionMetadata <- function(x, xcoll, xname, value,
                                  validate.value.fn=NULL, allow.add=TRUE) {
  stopifnot(is(x, 'GeneSetDb'))
  stopifnot(is.character(xcoll))
  if (!hasGeneSetCollection(x, xcoll)) {
    stop("GeneSetDb does not have collection: ", xcoll)
  }
  if (is.function(validate.value.fn)) {
    if (!isTRUE(validate.value.fn(value))) {
      stop(sprintf("Invalid value used to update %s,%s", xcoll, xname))
    }
  }
  # update or replace
  idx <- x@collectionMetadata[list(xcoll, xname), which=TRUE]
  if (is.na(idx)) {
    if (!allow.add) {
      msg <- sprintf("%s,%s metadata does not exist in the GeneSetDb, but it",
                     "should be there. Your GeneSetDb is hosed")
      stop(msg)
    }

    # the variable you want to enter here is not there yet, so we create an
    # empty, single-row data.table that will be added to the current metadata
    add.me <- data.table(
      collection = xcoll,
      name = xname,
      value = list(value))
    cm <- rbind(x@collectionMetadata, add.me)
    setkeyv(cm, key(x@collectionMetadata))
    x@collectionMetadata <- cm
  } else {
    # Need to use list(list()) because data.table uses list(.) to look for
    # values to assign to columns by reference.
    # https://stackoverflow.com/a/22536321/83761
    set(x@collectionMetadata, i = idx, j = "value", value = list(list(value)))
  }
  x
}

#' Add metadata at the geneset level.
#'
#' This function adds/updates columns entries in the `geneSets(gdb)` table.
#' If there already are defined meta values for the columns of `meta` in `x`,
#' these will be updated with the values in `meta`.
#'
#' TODO: should this be a setReplaceMethod, Issue #13 (?)
#' https://github.com/lianos/multiGSEA/issues/13
#'
#' @export
#'
#' @param x a `GeneSetDb` object
#' @param meta a `data.frame`-like object with `"collection"`, `"name"`, and
#'   an arbitrary amount of columns to add as metadata for the genesets.
#' @param ... not used yet
#' @return the updated `GeneSetDb` object `x`.
#' @examples
#' gdb <- exampleGeneSetDb()
#' meta.info <- transform(
#'   geneSets(gdb)[, c("collection", "name")],
#'   someinfo = sample(c("one", "two"), nrow(gdb), replace = TRUE))
#' gdb <- addGeneSetMetadata(gdb, meta.info)
addGeneSetMetadata <- function(x, meta, ...) {
  stopifnot(is(x, "GeneSetDb"))
  k <- key(x@table)
  stopifnot(is(meta, "data.frame"))
  stopifnot(all(k %in% colnames(meta)))
  special <- c("N", "n", "active")
  special <- intersect(special, colnames(meta))
  if (length(special)) {
    warning("Ignoring the following protected columns in the meta table:\n",
            paste(special, collapse = ","), immediate. = TRUE)
  }

  meta <- as.data.table(meta)
  setkeyv(meta, k)

  mdt <- unique(meta, by = k)
  if (nrow(mdt) != nrow(meta)) {
    stop("You have duplicate collection,name entries in the updated `meta` ",
         "data.frame. This is currently not allowed.")
  }

  # handle non std eval NOTE in R CMD check when using `:=` mojo
  .idx. <- NULL
  xtable <- copy(x@table)[, .idx. := 1:.N]
  xref <- xtable[mdt]

  bad.meta <- is.na(xref$N)
  if (any(bad.meta)) {
    warning(sum(bad.meta), " unknown gene sets in the meta update, ignoring.",
            immediate. = TRUE)
    xref <- xref[!bad.meta]
    meta <- meta[!bad.meta]
  }

  mcols <- setdiff(colnames(meta), c(k, special))
  if (length(mcols)) {
    idxs <- xref[[".idx."]]
    for (mc in mcols) {
      vals <- meta[[mc]]
      xtable[idxs, (mc) := vals]
    }
  }

  stopifnot(
    all.equal(xtable[, list(collection, name)], x@table[, list(collection, name)]),
    all.equal(xtable$N, x@table$N))
  xtable[[".idx."]] <- NULL
  x@table <- xtable
  x
}

#' Combines two GeneSetDb objects together
#'
#' @importMethodsFrom BiocGenerics combine
#' @exportMethod combine
#' @param x a GeneSetDb object
#' @param y a GeneSetDb object
#' @param ... more things
#' @return a new GeneSetDb that contains all genesets from `x` and `y`
#' @examples
#' gdb1 <- exampleGeneSetDb()
#' gdb2 <- GeneSetDb(exampleGeneSetDF())
#' gdb <- combine(gdb1, gdb2)
setMethod("combine", c(x = "GeneSetDb", y = "GeneSetDb"), function(x, y, ...) {

  ## Combine the db and featureIdMap(s)
  db <- rbindlist(list(x@db, y@db), use.names=TRUE, fill=TRUE)
  db <- unique(db, by=c('collection', 'name', 'feature_id'))
  db <- setkeyv(db, key(x@db))

  fms <- list(featureIdMap(x, as.dt=TRUE), featureIdMap(y, as.dt=TRUE))
  fm <- rbindlist(fms, use.names=TRUE, fill=TRUE)
  # ensure that a feature_id entry maps to only one x.id entry
  # DEBUG: Is this uniquification necessary?
  fm <- unique(fm, by=c('feature_id', 'x.id'))
  fm[, x.idx := NA_integer_]  ## blow out any `conform`-ation information
  setkeyv(fm, 'feature_id')

  cmeta <- rbind(x@collectionMetadata, y@collectionMetadata)
  cmeta <- unique(cmeta, by=key(x@collectionMetadata))
  setkeyv(cmeta, key(x@collectionMetadata))

  out <- .GeneSetDb(db=db, featureIdMap=fm, table=init.gsd.table.from.db(db),
                    collectionMetadata=cmeta)

  # Transfer over any extra metadata (columns) of the @table slots from
  # the two inputs incase the user stored extra data at the geneset level
  # in them.
  add.gs.cols <- setdiff(
    c(names(x@table), names(y@table)),
    names(out@table))
  if (length(add.gs.cols)) {
    gs.keys <- key(out@table)
    # If there were duplicate collection,name entries among the two GeneSetDb
    # objects, let's pull them out here. If both share the same extra meta
    # data names, x will win.
    x.cols <- c(gs.keys, intersect(add.gs.cols, names(x@table)))
    y.cols <- c(gs.keys, intersect(add.gs.cols, names(y@table)))
    duped <- merge(
      x@table[, x.cols, with = FALSE],
      y@table[, y.cols, with = FALSE],
      by = gs.keys,
      suffixes = c("", ".axeme"))

    # now just mash together the rest.
    gs <- rbindlist(list(x@table, y@table), use.names = TRUE, fill = TRUE)
    gs <- gs[, union(x.cols, y.cols), with = FALSE]
    setkeyv(gs, gs.keys)
    if (nrow(duped)) {
      # duplicate collection,name gene sets in x and y create repeated rows
      # in gs. Those are removed and replaced with `duped`
      duped <- duped[, !grepl(".axeme", names(duped)), with = FALSE]
      gs <- rbindlist(list(duped, gs[!duped]), use.names = TRUE, fill = TRUE)
      setkeyv(gs, gs.keys)
    }
    out@table <- merge(out@table, gs, by = gs.keys, all.x = TRUE)
  }

  out
})

#' @importMethodsFrom BiocGenerics nrow
#' @exportMethod nrow
#' @describeIn geneSets return number of genesets in GeneSetDb
setMethod("nrow", "GeneSetDb", function(x) nrow(geneSets(x, as.dt=TRUE)))

#' Checks equality (feature parity) between GeneSetDb objects
#'
#' @export
#'
#' @method all.equal GeneSetDb
#' @param target The reference `GeneSetDb` to compare against
#' @param current The `GeneSetDb` you wan to compare
#' @param features.only Only compare the "core" columns of `target@db`
#'   and `target@@table`. It is possible that you added additional columns
#'   (to keep track of symbols in `target@@db`, for instance) that you
#'   want to ignore for the purposes of the equality test.
#' @param ... moar args.
#' @return `TRUE` if equal, or \code{character} vector of messages if not.
all.equal.GeneSetDb <- function(target, current, features.only = TRUE, ...) {
  stopifnot(is(target, 'GeneSetDb'))
  stopifnot(is(current, 'GeneSetDb'))

  msg <- TRUE

  dbt <- setkeyv(copy(target@db), c('collection', 'name', 'feature_id'))
  gst <- geneSets(target, active.only=FALSE, as.dt=TRUE)

  dbc <- setkeyv(copy(current@db), key(dbt))
  gsc <- geneSets(current, active.only=FALSE, as.dt=TRUE)

  proto <- new("GeneSetDb")
  if (features.only) {
    dbt <- dbt[, names(proto@db), with=FALSE]
    dbc <- dbc[, names(proto@db), with=FALSE]
    gst <- gst[, names(geneSets(proto, as.dt=TRUE)), with=FALSE]
    gsc <- gsc[, names(gst), with=FALSE]
  }

  msg <- all.equal(dbt, dbc)
  if (!isTRUE(msg)) {
    return(msg)
  }

  msg <- all.equal(gst, gsc)
  if (!isTRUE(msg) || features.only) {
    return(msg)
  }

  all.equal(collectionMetadata(target, as.dt=TRUE),
            collectionMetadata(current, as.dt=TRUE))
}

# Conversion Methods -----------------------------------------------------------

#' Convert a GeneSetDb to other formats.
#'
#' @description
#' As awesome as a GeneSetDb is, you might find a time when you'll need your
#' gene set information in an other format. To do that, we provide the
#' following functions:
#'
#' * `as(gdb, "BiocSetf')`: convert to a [BiocSet::BiocSet()].
#' * `as(gdb, "GeneSetCollection")`: Convert to a
#'   [GSEABase::GeneSetCollection()] object.
#' * `as.data.(table|frame)(gdb)`: Perhaps the most natural format to convert to in
#'   order to save locally and examine outside of Bioconductor's GSEA universe,
#'   but not many other tools accept gene set definitions in this format.
#' * `as.list(gdb)`: A named list of feature identifiers. This is the format
#'   that many of the limma gene set testing methods use
#'
#' @details
#' The `as.*` functions accept a `value` parameter which indicates the type of
#' IDs you want to export in the conversion:
#'
#' * `"feature_id"`: The ID used as originally entered into the `GeneSetDb`.
#' * `"x.idx"`: Only valid if the GeneSetDb `x` has been `conform`-ed to an
#'   expession container. This option will export the features as the integer
#'   rows of the expression container.
#' * `"x.id"`: Only valid if the GeneSetDb `x` has been `conform`-ed. The
#'   target expression container might use feature identifiers that are
#'   different than what is in the GeneSetDb. If an active featureMap is
#'   set on the GeneSetDb, this will convert the original feature identifiers
#'   into a different target space (entrez to ensembl, for instance). Using
#'   this option, the features will be provided in the target space.
#'
#' @export
#' @rdname conversion
#' @name conversion
#' @aliases as.data.frame as.list as.data.table
#' @method as.data.table GeneSetDb
#'
#' @param x A `GeneSetDb` object
#' @param keep.rownames included here just for consistency with
#'   `data.table::as.data.table`, but it is not used
#' @param value The value type to export for the feature ids, defaults to
#'   `"feature_id"`.
#' @param active.only If the `GeneSetDb` is conformed, do you want to only
#'   return the features and genests that match target and are "active"?
#' @param ... pass through arguments (not used)
#' @return a converted `GeneSetDb`
#'
#' @examples
#' es <- exampleExpressionSet()
#' gdb <- conform(exampleGeneSetDb(), es)
#' bs <- as(gdb, "BiocSet")
#' gdf <- as.data.frame(gdb)
#' gdb <- conform(gdb, es)
#' gdfi <- as.data.frame(gdb, value = 'x.idx')
#' gdl <- as.list(gdb)
as.data.table.GeneSetDb <- function(x, keep.rownames = FALSE,
                                    value = c('feature_id', 'x.id', 'x.idx'),
                                    active.only = is.conformed(x), ...) {
  stopifnot(is(x, 'GeneSetDb'))
  value <- match.arg(value)
  if (!is.conformed(x) && value %in% c('x.id', 'x.idx')) {
    stop("must use value='feature_id' for non-conformed GeneSetDb'")
  }

  fid.map <- featureIdMap(x, as.dt=TRUE)
  if (is.conformed(x) && active.only) {
    # x.idx <- NULL # silence R CMD check NOTEs
    fid.map <- fid.map[!is.na(x.idx)]
  }

  gs <- copy(geneSets(x, active.only=active.only, as.dt=TRUE))

  gene2cat <- merge(x@db, fid.map, by='feature_id')
  gene2cat <- gene2cat[!is.na(gene2cat[[value]]),]
  gene2cat$finalId <- gene2cat[[value]]

  # collection <- name <- finalId <- feature_id <- NULL # silence R CMD check NOTEs
  out <- gene2cat[, list(collection, name, feature_id=finalId)]

  more.cols <- setdiff(names(gene2cat),
                       c(names(out), names(fid.map), 'finalId'))
  if (length(more.cols)) {
    for (col in more.cols) {
      out[, (col) := gene2cat[[col]]]
    }
  }

  setkeyv(out, key(x@db))
  out[gs[, list(collection, name)]]
}

#' @describeIn conversion convert a GeneSetDb to data.frame
#' @export
#' @method as.data.frame GeneSetDb
#' @param row.names,optional included here for consistency with `as.data.frame`
#'   generic function definition, but these are not used.
as.data.frame.GeneSetDb <- function(x, row.names = NULL, optional = NULL,
                                    value = c('feature_id', 'x.id', 'x.idx'),
                                    active.only = is.conformed(x), ...) {
  value <- match.arg(value)
  setDF(as.data.table(x, value = value, active.only = active.only, ...))
}

#' Split and conserve ordering
#'
#' Using base::split will turn f into a factor and won't preserve the ordering
#' of the elements in f. This function preserves the split order in f
#'
#' @noRd
csplit <- function(x, f) {
  f <- as.character(f)
  ff <- factor(f, levels=unique(f))
  split(x, ff)
}

#' @noRd
#' @method as.list GeneSetDb
#' @export
as.list.GeneSetDb <- function(x, value = c('feature_id', 'x.id', 'x.idx'),
                              active.only = is.conformed(x), nested = FALSE,
                              ...) {
  stopifnot(is(x, 'GeneSetDb'))
  value <- match.arg(value)
  df <- as.data.frame(x, value = value, active.only = active.only)
  if (nested) {
    colls <- unique(df$collection)
    ## Using the "naive split" call converts xdf$name to a factor and doesn't
    ## preserve ordering
    out <- sapply(colls, function(coll) {
      xdf <- df[df[['collection']] == coll,,drop=FALSE]
      csplit(xdf$feature_id, xdf$name)
    }, simplify=FALSE)
  } else {
    df$key <- encode_gskey(df)
    out <- csplit(df$feature_id, df$key)
  }
  out
}

#' @noRd
#' @importClassesFrom GSEABase GeneSetCollection
#' @importFrom GSEABase GeneSetCollection GeneSet NullIdentifier
setAs("GeneSetDb", "GeneSetCollection", function(from) {
  gs <- geneSets(from, as.dt=TRUE)
  n.coll <- length(unique(gs$collection))

  ## We explicitly set the key type after subsetting here in the event that
  ## a 0 row data.table is returned -- this isn't keyed in 1.9.4, which seems
  ## like a bug
  cmd <- collectionMetadata(from, as.dt=TRUE)
  org <- subset(cmd, name == 'organism')
  id.type <- subset(cmd, name == 'id_type')
  setkeyv(id.type, 'collection')
  setkeyv(org, 'collection')

  gsl <- lapply(seq_len(nrow(gs)), function(i) {
    name <- gs$name[i]
    coll <- gs$collection[i]
    idt <- id.type[coll]$value[[1]]
    if (!is(idt, 'GeneIdentifierType')) {
      idt <- NullIdentifier()
    }
    ids <- featureIds(from, coll, name, 'feature_id')
    xorg <- org[coll]$value[[1]]
    if (is.null(xorg)) {
      xorg <- ""
    }
    set.name <- name
    if (n.coll > 1L) {
      set.name <- paste0(coll, ';', set.name)
    }
    GeneSet(ids, setName=set.name, geneIdType=idt, organism=xorg)
  })
  gsc <- GeneSetCollection(gsl)
  gsc
})

#' @noRd
#' @importClassesFrom BiocSet BiocSet
setAs("GeneSetDb", "BiocSet", function(from) {
  # we can transfer a lot of things over, like the geneSetURL's and other
  # collectionMetadata, but for now let's just get the basics going

  # Append collectio names or no? The BiocSet doesn't have this concept, and
  # primary keys there are just the set names.
  #
  # Let's check if the geneset names only are unique, in which case we won't
  # include the collection prefix, otherwise we must.
  rm.prefix <- !any(duplicated(geneSets(from)$name))
  gs.list <- as.list(from)
  if (rm.prefix) {
    names(gs.list) <- sub(".*?;;", "", names(gs.list))
  }
  BiocSet::BiocSet(gs.list)
})

## -----------------------------------------------------------------------------
## indexing

#' Identify the row number of the geneset being queried.
#'
#' The method only allows querying by single length character vectors
#'
#' @noRd
#'
#' @param x A GeneSetDb
#' @param i the collection of the geneset
#' @param j the name of the geneset
#'
#' @return The row index of the geneset under question. If i,j do not match
#' a given geneset, then NA is returned.
.gsd.row.index <- function(x, i, j) {
  stopifnot(is(x, 'GeneSetDb'), is.character(i), is.character(j))
  geneSets(x, active.only=FALSE, as.dt=TRUE)[list(i, j), which=TRUE]
}
lianos/sparrow documentation built on Feb. 5, 2024, 2:58 p.m.