R/as.BiocAssayContainers.R

Defines functions as.SummarizedExperiment.FacileDataSet as.SummarizedExperiment.data.frame as.SummarizedExperiment as.ExpressionSet.FacileDataSet as.ExpressionSet.data.frame as.ExpressionSet as.DGEList.FacileDataSet as.DGEList.facile_frame as.DGEList.tbl as.DGEList.data.frame as.DGEList.matrix as.DGEList

Documented in as.DGEList as.DGEList.data.frame as.DGEList.FacileDataSet as.DGEList.facile_frame as.DGEList.tbl as.ExpressionSet as.ExpressionSet.data.frame as.ExpressionSet.FacileDataSet as.SummarizedExperiment as.SummarizedExperiment.data.frame as.SummarizedExperiment.FacileDataSet

#' Converts a "facile result" to a traditional Bioconductor assay container.
#'
#' An entire `FacileDataSet` or a subset of it can be converted into
#' bioconductor-standard assay containers, like a `SummarizedExperiment`,
#' `DGEList`, or `ExpressionSet` "at any time" using various `as.XXX` functions,
#' like `as.DGEList(...)`.
#'
#' We use the term "facile object" to refer to either the entirety of a
#' `FacileDataStore` or any sample-descriptor that specifies subsets of the
#' data, eg. where `fds(x)` returns a `FacileDataStore`. See examples for
#' specifics.
#'
#' Note that the order that the samples and features are materialized into the
#' expression container are not guaranteed.
#'
#' @rdname as.BiocContainer
#'
#' @export
#' @importFrom edgeR DGEList
#'
#' @param x a facile expression-like result
#' @param covariates The covariates the user wants to add to the $samples of
#'   the DGEList. This can take the following forms:
#'   - `TRUE`: All covariates are retrieved from the `FacileDataSet`
#'   - `FALSE`: TODO: Better handle FALSE
#'   - `character`: A vector of covariate names to fetch from the
#'     `FacileDataSet`. Must be elements of `names(sample_definitions(x))`
#'   - `data.frame`: A wide covariate table (dataset, sample_id, covariates ...)
#'     This may be external covariates for samples not available within
#'     `x` (yet), ie. a table of covariates provided by a third party.
#'   - `NULL`: do not decorate with *any* covariates.
#' @param feature_ids the features to get expression for (if not specified
#'   in `x` descriptor). These correspond to the elements found in the
#'   `feature_info_tbl(x)$feature_id` column.
#' @param assay_name the name of the assay matrix to use when populating the
#'   default assay matrix of the bioconductor container (the `$counts`
#'   matrix of a `DGEList`, the `exprs()` of an `ExpressionSet`, etc.).
#'   The default value is the entry provided by [default_assay()]
#' @param .fds The `FacileDataSet` that `x` was retrieved from
#' @param custom_key the custom key to use to fetch custom annotations from
#'   `.fds`
#' @return the appropriate bioconductor assay container, ie. an `edgeR::DGEList`
#'   for `as.DGEList`, a `Biobase::ExpressionSet` for `as.ExpressionSet`, or
#'   a `SummarizedExperiment::SummarizedExperiment` for
#'   `as.SummarizedExperiment`.
#'
#' @examples
#' fds <- exampleFacileDataSet()
#'
#' # Retrieve DGEList of gene expression for all samples
#' y.all <- as.DGEList(fds) # gene expression of all samples
#'
#' # Retrieve data for only 3 genes
#' # Suppose we only wanted female samples in our DGEList
#' y.fem <- fds |>
#'   filter_samples(sex == "f") |>
#'   as.DGEList() # or as.ExpressionSet()
#' @export
as.DGEList <- function(x, ...) {
  UseMethod("as.DGEList", x)
}

#' @noRd
#' @export
#' @method as.DGEList matrix
#' @rdname as.BiocContainer
#' @importFrom edgeR DGEList calcNormFactors
as.DGEList.matrix <- function(x, covariates = TRUE, feature_ids = NULL,
                              assay_name = NULL, .fds = NULL,
                              custom_key = Sys.getenv("USER"),
                              update_libsizes = !is.null(feature_ids),
                              update_normfactors = update_libsizes,
                              custom_libsizes = FALSE,
                              custom_normfactors = FALSE,
                              ...) {
  .fds <- assert_facile_data_store(.fds)
  assert_choice(assay_name, assay_names(.fds))

  ## Construct sample table from colnames of the matrix, and make sure this is
  ## legit
  samples <- tibble(
    dataset=sub('__.*$', '', colnames(x)),
    sample_id=sub('^.*?__', '', colnames(x)))

  # if you don't want to `collect` first, you could send `samples` in as
  # second argument and then copy that into the db.
  # #dboptimize

  # TODO: Start fixing here. This msethod assumes direct access to the db of
  # the .fds, which breaks the abstractions. Fix use of these methods:
  #   * feature_info_tbl()
  #   * fetch_sample_statistics() should be fetch_assay_covariates? or
  #     with_assay_covariates, because we get a wide table, maybe adding
  #     the libsize and normfactors would look like:
  #     assay_info <- with_assay_covariates(samples, assay_name)
  bad.samples <- samples |>
    anti_join(collect(samples(.fds), n = Inf),
              by=c("dataset", "sample_id")) |>
    collect(n = Inf)
  if (nrow(bad.samples)) {
    stop("Bad sample columns specified in the count matrix")
  }

  ## Fetch appropriate covariate
  if (!is.null(covariates)) {
    if (isTRUE(covariates)) {
      covariates <- fetch_sample_covariates(.fds, samples = samples)
      covariates <- spread_covariates(covariates)
    } else if (is.character(covariates)) {
      covariates <- fetch_sample_covariates(.fds, covariates = covariates,
                                            samples = samples)
      covariates <- spread_covariates(covariates)
    }
    assert_subset(c("dataset", "sample_id"), colnames(covariates))
    assert_true(nrow(covariates) == nrow(distinct(covariates)))
    covariates <- as.data.frame(covariates, stringsAsFactors = FALSE)
    rownames(covariates) <-  paste(covariates$dataset,
                                   covariates$sample_id,
                                   sep="__")
  }

  if (is(feature_ids, "data.frame") || is(feature_ids, "tbl")) {
    feature_ids <- feature_ids[["feature_id"]]
  }
  if (is.factor(features)) features <- as.character(features)
  if (!is.null(feature_ids) && is.character(feature_ids)) {
    keep <- feature_ids %in% rownames(x)
    if (mean(keep) != 1) {
      warning(sprintf("Only %d / %d feature_ids requested are in data matrix",
                      sum(keep), length(keep)))
    }
    feature_ids <- feature_ids[keep]
  } else {
    feature_ids <- rownames(x)
  }
  feature_ids <- unique(feature_ids)
  genes <- .fds |>
    features(assay_name = assay_name, feature_ids = feature_ids) |>
    as.data.frame()
  rownames(genes) <- genes[["feature_id"]]
  
  if (genes[["feature_type"]][1L] %in% c("entrez", "ensgid") || TRUE) {
    # After adding fluidigm/qPCR support, I wish I never renamed `name` to
    # `symbol`
    genes <- rename(genes, symbol = "name")
  }

  class(x) <- 'matrix'
  x <- x[feature_ids,,drop = FALSE]
  genes <- genes[feature_ids,,drop = FALSE]

  # Issue #2
  # https://github.com/denalitherapeutics/FacileData/issues/2
  sample.stats <- samples |>
    with_assay_covariates(c("libsize", "normfactor"),
                          assay_name,
                          .fds = .fds) |>
    collect(n = Inf) |>
    mutate(samid=paste(dataset, sample_id, sep='__')) |>
    as.data.frame()
  rownames(sample.stats) <- sample.stats[["samid"]]
  sample.stats <- sample.stats[colnames(x),,drop=FALSE]

  # TODO: https://github.com/facileverse/FacileData/issues/9
  # Since this is being used a general assay retrieval wrapper, what we
  # want might not even be appropriate for a DGEList (ie. negative numbers
  # inside) we hack for now, but need to address this!
  is.neg <- which(x < 0, arr.ind = TRUE)
  if (nrow(is.neg)) x[is.neg] <- x[is.neg] * -1

  y <- DGEList(x, genes = genes, lib.size = sample.stats[["libsize"]])
  # This avoids the warning when norm factors don't multiply to 1
  y$samples$norm.factors <- sample.stats[["normfactor"]]

  y$samples <- cbind(
    y$samples,
    sample.stats[, c("dataset", "sample_id", "samid"), drop = FALSE])

  if (update_libsizes) {
    y[["samples"]][["lib.size"]] <- colSums(y[["counts"]])
  }
  if (update_normfactors) {
    y <- edgeR::calcNormFactors(y)
  }

  if (!is.null(covariates)) {
    covs <- select(covariates, -dataset, -sample_id)
    covs <- covs[colnames(y),,drop=FALSE]
    if ("group" %in% colnames(covs)) {
      y[["samples"]][["group"]] <- covs[["group"]]
      covs[["group"]] <- NULL
    }
    # Remove any lib.size or norm.factors that might have come in through
    # here. Replace the ones loaded in the DGEList if the corresponding
    # `custom_*` parameter is set to `TRUE`
    if ("lib.size" %in% colnames(covs)) {
      if (custom_libsizes && is.numeric(covs[["lib.size"]])) {
        y$samples[["lib.size"]] <- covs[["lib.size"]]
      }
      covs[["lib.size"]] <- NULL
    }
    if ("norm.factors" %in% colnames(covs)) {
      if (custom_normfactors && is.numeric(covs[["norm.factors"]])) {
        y$samples[["norm.factors"]] <- covs[["norm.factors"]]
      }
      covs[["norm.factors"]] <- NULL
    }

    y$samples <- cbind(y$samples, covs)
  }

  # TODO: https://github.com/facileverse/FacileData/issues/9
  if (nrow(is.neg)) {
    y$counts[is.neg] <- y$counts[is.neg] * -1
  }

  set_fds(y, .fds)
}

#' @export
#' @method as.DGEList data.frame
#' @rdname as.BiocContainer
#' @importFrom data.table dcast set setDT
as.DGEList.data.frame <- function(x, covariates = TRUE, feature_ids = NULL,
                                  assay_name = NULL, .fds = NULL,
                                  custom_key = Sys.getenv("USER"),
                                  ...) {
  .fds <- assert_facile_data_store(.fds)
  if (is.null(assay_name)) {
    assay_name <- default_assay(.fds)
  }
  assert_choice(assay_name, assay_names(.fds))
  x <- assert_sample_subset(x)

  has.count <- 'value' %in% colnames(x) && is.integer(x[['value']])
  fetch.counts <- !has.count

  ## Do we want to fetch counts from the FacileDataSet?
  if (has.count) {
    if (is.factor(feature_ids)) feature_ids <- as.character(feature_ids)
    if (is.character(feature_ids) && !all(feature_ids %in% x[['feature_id']])) {
      fetch.counts <- TRUE
    }
    if (!missing(feature_ids) && is.null(feature_ids)) {
      ## user explicitly wants everythin
      fetch.counts <- TRUE
    }
  }

  if (fetch.counts) {
    if (has.count) {
      warning("Ignoring expression in `x` and fetching data for `feature_ids`",
              immediate.=TRUE)
    }
    ## Check that we are getting the right type of assay for this
    ainfo <- assay_info(.fds, assay_name)
    if (!ainfo$assay_type %in% c("rnaseq", "isoseq")) {
      warning("Creating DGEList for something other than rnaseq type assay")
    }
    counts <- fetch_assay_data(.fds, feature_ids, x, assay_name = assay_name,
                               normalized = FALSE, as.matrix = TRUE)
  } else {
    cdt <- assert_expression_result(x) |>
      collect(n = Inf) |>
      setDT() |>
      unique(by = c('dataset', 'sample_id', 'feature_id'))
    data.table::set(
      cdt,
      j = "samid",
      value = paste(cdt[["dataset"]], cdt[["sample_id"]], sep = "__"))
    counts <- local({
      wide <- data.table::dcast(cdt, feature_id ~ samid, value.var = "value")
      out <- as.matrix(wide[, -1L, with=FALSE])
      rownames(out) <- wide[[1L]]
      class(out) <- c('FacileExpression', class(out))
      out
    })
  }

  out <- as.DGEList.matrix(counts, covariates = covariates,
                           feature_ids = feature_ids,
                           assay_name = assay_name, .fds = .fds,
                           custom_key = custom_key, ...)

  # Add any covariates in x back to out$samples
  xref <- match(colnames(out), paste(x$dataset, x$sample_id, sep = "__"))
  cnames <- setdiff(colnames(x),
                    c("dataset", "sample_id", "lib.size", "norm.factors"))
  if (length(cnames)) {
    for (cname in cnames) out$samples[[cname]] <- x[[cname]][xref]
  }
  out
}

#' @method as.DGEList tbl
#' @export
#' @rdname as.BiocContainer
as.DGEList.tbl <- function(x, covariates = TRUE, feature_ids = NULL,
                           assay_name = NULL, .fds = NULL,
                           custom_key = Sys.getenv("USER"),
                           ...) {
  .fds <- assert_facile_data_store(.fds)
  if (is.null(assay_name)) {
    assay_name <- default_assay(.fds)
  }
  assert_choice(assay_name, assay_names(.fds))
  x <- collect(x, n = Inf)
  # NextMethod()
  as.DGEList.data.frame(x, covariates, feature_ids, assay_name, .fds = .fds,
                        custom_key = custom_key, ...)
}

#' @export
#' @rdname as.BiocContainer
as.DGEList.facile_frame <- function(x, covariates = TRUE, feature_ids = NULL,
                                    assay_name = NULL,
                                    custom_key = Sys.getenv("USER"),
                                    ...) {
  x <- collect(x, n = Inf)
  .fds <- assert_facile_data_store(fds(x))
  if (is.null(assay_name)) {
    assay_name <- default_assay(.fds)
  }
  assert_choice(assay_name, assay_names(.fds))

  has.count <- "value" %in% colnames(x) &&
    is.integer(x[["value"]]) &&
    is.character("feature_id")

  if (has.count && is.null(feature_ids)) {
    feature_ids <- unique(x[["feature_id"]])
  }

  as.DGEList.tbl(x, covariates, feature_ids, assay_name, .fds = .fds,
                 custom_key = custom_key, ...)
}


#' @export
#' @rdname as.BiocContainer
as.DGEList.FacileDataSet <- function(x, covariates = TRUE, feature_ids = NULL,
                                     assay_name = NULL,
                                     custom_key = Sys.getenv("USER"),
                                     ...) {
  xs <- samples(x)
  if (is.null(assay_name)) {
    assay_name <- default_assay(x)
  }
  assert_choice(assay_name, assay_names(x))
  as.DGEList(xs, covariates = covariates, feature_ids = feature_ids,
             assay_name = assay_name, custom_key = custom_key, ...)
}

#' @rdname as.BiocContainer
#' @export
#' @return a \code{\link[Biobase]{ExpressionSet}}
as.ExpressionSet <- function(x, ...) {
  UseMethod('as.ExpressionSet')
}

#' @rdname as.BiocContainer
#' @export
#' @method as.ExpressionSet data.frame
as.ExpressionSet.data.frame <- function(x, covariates=TRUE, feature_ids=NULL,
                                        assay_name=default_assay(.fds),
                                        .fds=fds(x), custom_key=Sys.getenv("USER"), ...) {
  .fds <- force(.fds)
  stopifnot(is.FacileDataSet(.fds))
  assert_sample_subset(x)

  ns <- tryCatch(loadNamespace("Biobase"), error = function(e) NULL)
  if (is.null(ns)) stop("Biobase required for `as.ExpressionSet`")

  y <- as.DGEList(x, covariates, feature_ids, assay_name, .fds=.fds,
                  custom_key=custom_key, ...)
  es <- ns$ExpressionSet(y$counts)
  es <- ns$`pData<-`(es, y$samples)
  es <- ns$`fData<-`(es, y$genes)
  set_fds(es, .fds)
}

#' @rdname as.BiocContainer
#' @export
#' @method as.ExpressionSet FacileDataSet
as.ExpressionSet.FacileDataSet <- function(x, covariates=TRUE, feature_ids=NULL,
                                           assay_name=default_assay(.fds),
                                           .fds=fds(x),
                                           custom_key=Sys.getenv("USER"), ...) {
  force(.fds)
  x <- samples(x) |> collect(n=Inf) |> set_fds(.fds)
  as.ExpressionSet(x, covariates, feature_ids, assay_name, x,
                   custom_key, ...)
}

#' @rdname as.BiocContainer
#' @export
#' @return a \code{\link[SummarizedExperiment]{SummarizedExperiment}}
as.SummarizedExperiment <- function(x, ...) {
  UseMethod('as.SummarizedExperiment')
}

#' @rdname as.BiocContainer
#' @export
#' @method as.SummarizedExperiment data.frame
as.SummarizedExperiment.data.frame <- function(x, covariates=TRUE, feature_ids=NULL,
                                               assay_name=default_assay(.fds),
                                               .fds=fds(x),
                                               custom_key=Sys.getenv("USER"),
                                               ...) {
  .fds <- force(.fds)
  stopifnot(is.FacileDataSet(.fds))
  assert_sample_subset(x)

  ns <- tryCatch(loadNamespace("SummarizedExperiment"), error = function(e) NULL)
  if (is.null(ns)) stop("SummarizedExperiment required for")

  y <- as.DGEList(x, covariates, feature_ids, assay_name, .fds=.fds,
                  custom_key=custom_key, ...)
  ## TODO: Check y$genes to see if we should make a rowRanges out of the
  ## rowData or just keep it as a DataFrame
  out <- ns$SummarizedExperiment(
    y$counts, colData=y$samples, rowData=y$genes, ...)
  set_fds(out, .fds)
}

#' @rdname as.BiocContainer
#' @export
#' @method as.SummarizedExperiment FacileDataSet
as.SummarizedExperiment.FacileDataSet <- function(x, covariates=TRUE, feature_ids=NULL,
                                                  assay_name=default_assay(.fds),
                                                  .fds=fds(x), custom_key=Sys.getenv("USER"),
                                                  ...) {
  force(.fds)
  x <- samples(x) |> collect(n=Inf) |> set_fds(.fds)
  as.SummarizedExperiment(x, covariates, feature_ids, assay_name, x,
                           custom_key, ...)
}
facilebio/FacileData documentation built on Feb. 23, 2024, 9:14 a.m.