## The code in here builds FacileDataSets from Bioconductor like objects
#' Create an empty FacileDataSet
#'
#' This is a helper function that is currently only called from
#' `as.FacileDataSet`. It creates the datastore directory skeleton, copies
#' the meta file in as `meta.yaml`, initializes the SQLite database from the
#' package's schema file, and creates an empty HDF5 file with an `assay`
#' group.
#'
#' @importFrom rhdf5 h5createFile h5createGroup H5close
#' @importFrom DBI dbConnect dbDisconnect dbExecute dbGetQuery
#' @importFrom RSQLite SQLite
#'
#' @param path the directory to create which will house the
#'   \code{FacileDataSet}
#' @param meta_file the path to the meta (yaml) definition file, which is
#'   validated and copied into the datastore as \code{meta.yaml}
#' @param page_size,cache_size \code{pragma} values to setup the backend SQLite
#'   database
#' @return invisibly returns the path to the successfully created datastore
initializeFacileDataSet <- function(path, meta_file,
                                    page_size=2**12, cache_size=2e5) {
  assert_valid_meta_file(meta_file)
  path <- normalizePath(path, mustWork=FALSE)
  if (file.exists(path) || dir.exists(path)) {
    stop("Collision with file: ", path)
  }
  parent.dir <- dirname(path)
  assert_directory_exists(parent.dir)
  assert_access(parent.dir, 'w') ## check parent directory is writeable

  dir.create(path)
  dir.create(file.path(path, 'custom-annotation'))
  # create dummy file in custom-annotation directory for source control and
  # s3 synching of empty directories
  writeLines("Empty placeholder for synching empty directories",
             file.path(path, 'custom-annotation', 'README.txt'))
  file.copy(meta_file, file.path(path, 'meta.yaml'))

  ## Create sqlite db and run the packaged schema-definition SQL
  db.fn <- file.path(path, 'data.sqlite')
  con <- dbConnect(SQLite(), db.fn)
  on.exit(dbDisconnect(con))
  dbExecute(con, 'pragma temp_store=MEMORY;')
  dbExecute(con, sprintf('pragma page_size=%d', page_size))
  dbExecute(con, sprintf('pragma cache_size=%d;', cache_size))
  sql.fn <- system.file('extdata', 'init', 'faciledataset.sql',
                        package='FacileData')
  db.sql <- sqlFromFile(sql.fn)
  executeSQL(con, db.sql)

  ## Create empty HDF5 file with a top-level `assay` group
  hd5.fn <- file.path(path, 'data.h5')
  h5createFile(hd5.fn)
  h5createGroup(hd5.fn, 'assay')

  invisible(path)
}
#' Validate a FacileDataSet meta (yaml) file
#'
#' Checks that `fn` exists, looks like a yaml file, and defines all of the
#' required top-level entries for a `FacileDataSet`.
#'
#' @export
#' @importFrom tools file_ext
#' @param fn path to the meta file
#' @param as.list when `TRUE`, return the parsed yaml contents as a list;
#'   otherwise (default) the validated path `fn` itself is returned.
#' @return `fn`, or its parsed contents when `as.list = TRUE`; stops on an
#'   invalid file.
assert_valid_meta_file <- function(fn, as.list = FALSE) {
  assert_file(fn)
  # accept both conventional yaml file extensions (.yaml and .yml)
  if (!tolower(file_ext(fn)) %in% c('yaml', 'yml')) {
    stop("meta file must be a yaml file")
  }
  dat <- yaml.load_file(fn)
  req.toplevel <- c('name', 'organism', 'datasets', 'sample_covariates',
                    'default_assay')
  miss.toplevel <- setdiff(req.toplevel, names(dat))
  if (length(miss.toplevel)) {
    stop("Missing the following definitions in meta file: ",
         paste(miss.toplevel, collapse=","))
  }
  if (as.list) dat else fn
}
## Feature identifier universes that get special treatment elsewhere in the
## package; "custom" is the catch-all for everything else.
.feature.types <- c('entrez', 'ensgid', 'enstid', 'custom')
## Known assay types. The count-like types (rnaseq, isoseq, pseudobulk) get
## library-size / normalization-factor handling in addFacileAssaySet().
.assay.types <- c(
"rnaseq", "pseudobulk", "isoseq", "normcounts", "nanostring",
"qpcrct", "qpcrdct", "lognorm", "raw")
## R storage modes supported for assay matrices written to the HDF5 store
.storage.modes <- c('integer', 'numeric')
## Does `x` belong to one of the assay container classes we know how to
## extract data from? TRUE when any class in x's class hierarchy (as reported
## by methods::is) matches a supported container type.
supported.assay.container <- function(x) {
  known.containers <- c("DGEList", "EList", "eSet", "SummarizedExperiment",
                        "matrix")
  matched <- intersect(is(x), known.containers)
  length(matched) > 0L
}
#' Extract the assay matrix out of a supported container
#'
#' Pulls the (first, or `assay_name`d) assay matrix out of a DGEList, EList,
#' eSet, SummarizedExperiment, or plain matrix, then sanity-checks that its
#' dimensions and dimnames agree with the container it came from.
#'
#' @param x a supported assay container (see `supported.assay.container`)
#' @param assay_name name of the assay element to extract; when `NULL` the
#'   first element is used (ignored for DGEList / EList / matrix inputs)
#' @return the assay `matrix`
extract.assay <- function(x, assay_name=NULL) {
  if (is(x, 'DGEList')) {
    out <- x$counts
  } else if (is(x, "EList")) {
    out <- x$E
  } else if (is(x, 'eSet')) {
    ns <- loadNamespace("Biobase")
    # FIX: call through the namespace object -- Biobase may only be loaded,
    # not attached, so a bare assayDataElementNames() would not be found
    if (is.null(assay_name)) assay_name <- ns$assayDataElementNames(x)[1]
    out <- ns$assayDataElement(x, assay_name)
  } else if (is(x, 'SummarizedExperiment')) {
    ns <- loadNamespace("SummarizedExperiment")
    out <- if (is.null(assay_name)) {
      ns$assays(x)[[1L]]
    } else {
      ns$assay(x, assay_name)
    }
  } else if (is(x, 'matrix')) {
    out <- x
  } else {
    stop("Unsupported assay container: ", class(x)[1L])
  }
  # all.equal() returns a character vector on mismatch, so wrap in isTRUE()
  # to give stopifnot() a proper logical
  stopifnot(is(out, 'matrix'),
            nrow(out) == nrow(x),
            ncol(out) == ncol(x),
            isTRUE(all.equal(rownames(out), rownames(x))),
            isTRUE(all.equal(colnames(out), colnames(x))))
  out
}
#' Validate a list of assay containers before loading them into a FacileDataSet
#'
#' Checks that every element of `datasets`: (1) is a supported container
#' type; (2) stores data with a single, consistent type matching
#' `storage_mode`; (3) has sample (column) names that begin like valid R
#' names; (4) measures the same, duplicate-free feature set; and (5) is fully
#' covered by `facile_feature_info`.
#'
#' @param datasets named list of assay containers
#' @param facile_feature_info data.frame whose `feature_id` column must cover
#'   every rowname in the datasets
#' @param storage_mode expected storage mode ("integer" or "numeric")
#' @param assay_name assay element to extract from each container
#' @return `TRUE`; stops (or warns) on invalid input
assert_valid_assay_datasets <- function(datasets, facile_feature_info,
                                        storage_mode, assay_name=NULL) {
  supported <- vapply(datasets, supported.assay.container, logical(1L))
  if (any(!supported)) {
    idxs <- which(!supported)
    bad <- sapply(datasets[idxs], function(d) class(d)[1L]) |> unique()
    stop("Unsupported assay container(s):", paste(bad, collapse=","))
  }

  ## dataset names must be present, unique, valid R variable names
  dnames <- names(datasets)
  if (is.null(dnames) ||
      any(dnames != make.names(dnames)) ||  # FIX: was all(); warn on ANY bad name
      length(unique(dnames)) != length(dnames)) {
    # stop("names(datasets) must be valid varnames and all unique")
    warning("names(datasets) are not valid R variable names", immediate. = TRUE)
  }

  ## all datasets must share one data type that agrees with storage_mode
  smodes <- vapply(datasets, function(d) {
    class(extract.assay(d, assay_name)[1L])
  }, character(1L))
  smodes <- unique(smodes)
  if (length(smodes) != 1L) {
    stop("More than one data type across dataset: ",
         paste(smodes, collapse=','))
  }
  stopifnot(smodes == storage_mode)

  ## Should we enforce valid variable names in sample columns?
  ## biobroom says yes, but for now we just force that first character is a
  ## letter
  valid.colnames <- vapply(datasets, function(d) {
    all(substring(colnames(d), 1, 1) == make.names(substring(colnames(d), 1, 1)))
  }, logical(1L))
  bad.ds <- which(!valid.colnames)
  if (length(bad.ds)) {
    stop("colnames(ds) should be valid variable names, offending entries:\n",
         paste(bad.ds, collapse=","))
  }

  ## Check that there are no duplicate rownames() in assays and that they all
  ## have measurements for the same features
  fids <- rownames(datasets[[1]])
  if (any(duplicated(fids))) stop("Duplicated rownames exist")
  features.consistent <- vapply(datasets, function(d) {
    nrow(d) == length(fids) && setequal(rownames(d), fids)
  }, logical(1L))
  stopifnot(all(features.consistent))

  ## Check that we have feature_info entries for all features in assays
  ffi.equal <- setequal(facile_feature_info$feature_id, fids)
  if (!ffi.equal) {
    # FIX: typo in original message ("seteqaul")
    stop("facile_feature_info$feature_id is not setequal to datasets")
  }
  TRUE
}
#' Adds new set of assay data for all samples in a FacileDataSet
#'
#' Once a FacileDataSet has been created and initialized, either via a
#' low-level call to [initializeFacileDataSet()], or a call to
#' [as.FacileDataSet()] over a list of BiocAssayContainers, you can add more
#' assays (i.e. RNA-seq, microarray, etc.) to the FacileDataSet using this
#' function.
#'
#' Note that you cannot add assay data piecemeal. That is to say, you can not
#' call this function once to add copynumber data
#' (addFacileAssaySet(..., facile_assay_type = "cnv") to a subset of samples
#' and later call this function again to add copynumber to the rest of the
#' samples. The function will throw an error if
#' facile_assay_type %in% assay_names(x) == TRUE.
#'
#' @md
#' @importFrom rhdf5 h5createFile h5createDataset h5write
#' @export
#'
#' @param x The `FacileDataSet`
#' @param datasets list of `ExpressionSet`, `SummarizedExperiment`, or
#'   `DGEList`s that have the new assay data across all of the datasets in `x`.
#' @param facile_assay_name the name of the assay in the source dataset object
#' @param facile_assay_type string indicating the assay_type ('rnaseq',
#'   'affymetrix', etc.)
#' @param facile_feature_type a string indicating the universe the features in
#'   this can be anything, but certain identifiers are special like `"entrez"`,
#'   `"ensgid"`, `"enstid"`, etc.
#' @param facile_assay_description a string that allows the caller to provide
#'   a "freeform" description of the assay (platform, protocol, whatever).
#'   Defaults to `facile_assay_type` when `NULL`.
#' @param facile_feature_info a `data.frame` with the required `feature_info`
#'   columns that describe the features in this assay. Please refer to the
#'   "Features" section of the `FacileDataSet` vignette for more complete
#'   description.
#' @param storage_mode either `"integer"` or `"numeric"`, maps to the
#'   `storage.mode` parameter in [rhdf5::h5createDataset()]
#' @param chunk_rows the first entry in the `chunk` parameter in
#'   [rhdf5::h5createDataset()] (`integer`)
#' @param chunk_cols the second entry in the `chunk` parameter in
#'   [rhdf5::h5createDataset()]. If this is `"ncol"`, it is set to the number
#'   of columns in each of the internal dataset matrices being added.
#' @param chunk_compression the `level` parameter in [rhdf5::h5createDataset()]
#' @param assay_name the assay name in the data containers provided in the
#'   `datasets` list.
#' @param warn_existing when `TRUE`, warn when appended rows already exist in
#'   the facile database tables (passed down to `append_facile_table`).
#' @param add_sample_covariates when `TRUE` (default), sample covariates are
#'   extracted from `datasets` and appended to the `sample_covariate` table.
#' @param covariate_def optional covariate definition forwarded to
#'   `.prepare_sample_covariates`.
#' @return a `tibble` subset of `facile_feature_info` that indicates the *new*
#'   features that were added to the internal `feature_info_tbl`.
#' @importFrom edgeR calcNormFactors
addFacileAssaySet <- function(x, datasets, facile_assay_name,
                              facile_assay_type,
                              facile_feature_type = NULL,
                              facile_assay_description = NULL,
                              facile_feature_info,
                              storage_mode = .storage.modes,
                              chunk_rows = 5000, chunk_cols = "ncol",
                              chunk_compression = 4,
                              assay_name = NULL, warn_existing = FALSE,
                              add_sample_covariates = TRUE,
                              covariate_def = NULL) {
  ## Parameter Checking --------------------------------------------------------
  stopifnot(is.FacileDataSet(x))
  assert_string(facile_assay_name)
  assert_string(facile_assay_type)
  if (facile_assay_name %in% assay_names(x)) {
    stop("`", facile_assay_name, "` assay already stored in FacileDataSet")
  }
  ainfo <- assay_info(x)
  if (facile_assay_type %in% ainfo$assay_type) {
    warning("assay exists of this type already: ", facile_assay_type,
            immediate.=TRUE)
  }
  ## NOTE: despite the NULL default, a string is effectively required here
  facile_feature_type <- assert_string(facile_feature_type)
  if (is.null(facile_assay_description)) {
    facile_assay_description <- facile_assay_type
  }
  assert_string(facile_assay_description)
  storage_mode <- match.arg(storage_mode, .storage.modes)

  assert_valid_assay_datasets(datasets, facile_feature_info, storage_mode,
                              assay_name)

  ## Ensure no redundancy in facile_feature_info
  nf <- facile_feature_info |>
    distinct(feature_type, feature_id) |>
    nrow()
  if (nrow(facile_feature_info) != nf) {
    stop("Redundant features in facile_feature_info")
  }

  ## Insert entry into assay_info table ----------------------------------------
  ai <- tibble(assay=facile_assay_name,
               assay_type=facile_assay_type,
               feature_type=facile_feature_type,
               description=facile_assay_description,
               nfeatures=nrow(datasets[[1]]),
               storage_mode=storage_mode) |>
    append_facile_table(x, "assay_info", warn_existing = warn_existing)

  ## Insert Feature Information into FacileDataSet -----------------------------
  ## Insert new features into global feature_info table
  features <- x |>
    append_facile_feature_info(facile_feature_info,
                               type = facile_feature_type,
                               warn_existing = warn_existing) |>
    select(feature_type, feature_id, added)

  ## Create entries in `assay_feature_info` table to track hdf5 indices for
  ## the features in this assay
  afi <- feature_info_tbl(x) |>
    semi_join(select(features, -added),
              by = c('feature_type', 'feature_id'),
              copy = TRUE, auto_index = TRUE) |>
    collect(n=Inf)
  afi <- transmute(afi, assay = facile_assay_name, feature_id,
                   hdf5_index = seq(nrow(afi))) |>
    append_facile_table(x, "assay_feature_info", warn_existing) |>
    arrange(hdf5_index)
  stopifnot(nf == nrow(afi), all(afi$hdf5_index == seq(nf)))

  ## Insert assay data and track sample info -----------------------------------
  aname <- paste0('assay/', facile_assay_name)
  stopifnot(h5createGroup(hdf5fn(x), aname))

  ## loop over datasets and insert one-by-one
  ## Mash it up all together (memory intensive!) to create norm.factor
  ## this also checks that rows are consistent across datasets (again) and
  ## returns them in the right order.
  ## FIX: iterate over names(datasets) so `ds` is defined in the error
  ## message below (it was an undefined symbol in the original lapply)
  dats <- lapply(names(datasets), function(ds) {
    dat <- extract.assay(datasets[[ds]], assay_name)
    if (!setequal(rownames(dat), afi$feature_id)) {
      stop("Mismatch in rownames(", ds, ") and feature_id's in database")
    }
    dat[afi$feature_id, , drop=FALSE]
  })
  names(dats) <- names(datasets)

  ## If rnaseq, do calcnormfactors
  if (facile_assay_type %in% c('rnaseq', 'isoseq', 'pseudobulk')) {
    ## Can create assay_sample_info_table
    cnts <- do.call(cbind, dats)
    normfactors <- calcNormFactors(cnts)
    asi <- tibble(
      assay=facile_assay_name,
      dataset=rep(names(datasets), vapply(dats, ncol, integer(1L))),
      sample_id=colnames(cnts),
      hdf5_index=lapply(dats, function(d) seq_len(ncol(d))) |> unlist(),
      libsize=colSums(cnts),
      normfactor=normfactors)
    rm(cnts)
  } else {
    ## non count-like assays get sentinel libsize/normfactor values of -1
    asi <- lapply(names(dats), function(ds) {
      mtrx <- dats[[ds]]
      tibble(
        assay=facile_assay_name,
        dataset=ds,
        sample_id=colnames(mtrx),
        hdf5_index=seq_len(ncol(mtrx)),
        libsize=-1, normfactor=-1)
    }) |> bind_rows()
  }
  asi <- append_facile_table(asi, x, 'assay_sample_info', warn_existing)

  # "numeric" R storage mode is "double" storage mode in hdf5
  if (storage_mode == 'numeric') {
    storage_mode <- 'double'
  }

  assay.dat <- lapply(names(dats), function(ds) {
    ## This should be addFacileAssay(x, assay_name, dat, subset(asi, ...))
    dat <- dats[[ds]]
    if (is.character(chunk_cols) && chunk_cols == 'ncol') {
      xchunk.cols <- ncol(dat)
    } else if (is.numeric(chunk_cols)) {
      ## FIX: accept plain (double) numbers; is.integer() rejected e.g. 100
      xchunk.cols <- min(chunk_cols, ncol(dat))
    } else {
      stop("Unrecognized value for chunk.cols: ", chunk_cols)
    }
    ## FIX: honor the caller's chunk_rows. The original
    ## `min(chunk_rows=5000, nrow(dat))` passed a literal 5000 as a named
    ## (and ignored-name) argument to min(), silently discarding chunk_rows.
    xchunk.rows <- min(chunk_rows, nrow(dat))
    chunk <- c(xchunk.rows, xchunk.cols)
    dname <- sprintf('%s/%s', aname, ds)
    if (chunk_compression == 0) {
      h5createDataset(hdf5fn(x), dname, dim(dat), storage.mode = storage_mode,
                      level = 0, filter = "NONE")
    } else {
      h5createDataset(hdf5fn(x), dname, dim(dat), storage.mode=storage_mode,
                      chunk = chunk, level = chunk_compression)
    }
    ## For some reason dispatching on this keeps messing up:
    ##   Error in UseMethod("h5write") (from construction.R#296) :
    ##   no applicable method for 'h5write' applied to an object of class
    ##   "c('matrix', 'integer', 'numeric')"
    rhdf5::h5write(dat, file = hdf5fn(x), name = dname)
    tibble(assay=facile_assay_name, dataset=ds, sample_id=colnames(dat),
           hdf5_index=seq_len(ncol(dat)))
  })
  assay.dat <- bind_rows(assay.dat)

  ## Can check that asi == assay.dat
  chk <- inner_join(asi, assay.dat, by=c('assay', 'dataset', 'sample_id'))
  stopifnot(nrow(chk) == nrow(asi), all(chk$hdf5_index.x == chk$hdf5_index.y))

  # add samples to sample_info table if they're not there.
  samples <- asi |>
    mutate(parent_id=NA_character_) |>
    append_facile_table(x, 'sample_info', warn_existing)

  if (add_sample_covariates) {
    sample_covariates <- .prepare_sample_covariates(
      datasets, covariate_def = covariate_def)
    # add sample covariates to table
    sample.covs <- sample_covariates$eav |>
      mutate(date_entered = as.integer(Sys.time())) |>
      append_facile_table(x, "sample_covariate",
                          warn_existing = warn_existing)
  }

  # TODO: Add sample covariates for samples that aren't yet loaded -------------
  invisible(list(samples=samples, assay_sample_info=asi))
}
#' Appends new features to `feature_info` table
#'
#' This function only adds features (feature_type, feature_id) that are not
#' in the `feature_info` table already
#'
#' @export
#' @param x The `FacileDataSet`
#' @param feature_info a table of new features that provides all columns
#'   in `feature_info_tbl(x)`
#' @param type A way to override (or set) the `feature_type` column of the
#'   `feature_info` table
#' @param warn_existing when `TRUE`, warn about rows that already exist in
#'   the `feature_info` table (passed to `append_facile_table`)
#' @return invisible returns an annotated version of the `feature_info`
#'   table with an `$added` column with `TRUE/FALSE` values for the
#'   features that were new (and added) to the repository or `FALSE` to
#'   indicate that they were already in the database.
append_facile_feature_info <- function(x, feature_info,
                                       type = feature_info$feature_type,
                                       warn_existing = FALSE) {
  ## Argument Checking
  stopifnot(is.FacileDataSet(x))
  stopifnot(is.data.frame(feature_info))
  if (is.null(feature_info$feature_type)) {
    stopifnot(is.character(type), length(type) %in% c(1L, nrow(feature_info)))
    feature_info$feature_type <- type
  }
  ftypes <- unique(feature_info$feature_type)
  if (length(ftypes) > 1) {
    warning("Adding more than one feature_type to feature_info table",
            immediate.=TRUE)
  }
  # "source" may not be defined. If not, let's add it as "unspecified" and warn
  # the user
  if (is.null(feature_info[["source"]])) {
    warning('`"source"` column not specified in feature info, filling in with ',
            '"unspecified"', immediate. = TRUE)
    # FIX: the value written was "unspecifed" (typo), which did not match the
    # "unspecified" value promised in the warning above
    feature_info[["source"]] <- rep("unspecified", nrow(feature_info))
  }
  added <- feature_info |>
    distinct(feature_type, feature_id, .keep_all = TRUE) |>
    append_facile_table(x, 'feature_info', warn_existing = warn_existing)
  invisible(added)
}
## NOTE(review): removed stray web-page footer text ("Add the following code
## to your website...") that was pasted into this file; it was not valid R
## and would have broken parsing.