# Notes:
# o first pass error handling improvements are done.
#
#' Build a small cell_data_set.
#' @param matrix_control A list used to control how the counts matrix is stored
#' in the CDS. By default, Monocle3 stores the counts matrix in-memory as a
#' sparse matrix. Setting 'matrix_control=list(matrix_class="BPCells")',
#' stores the matrix on-disk as a sparse matrix.
#' @return cds object
#' @examples
#' \donttest{
#' cds <- load_a549()
#' }
#'
#' @export
load_a549 <- function(matrix_control=list()){
matrix_control_res <- tryCatch(set_matrix_control(matrix_control=matrix_control, matrix_control_default=list(), control_type='unrestricted'),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_a549')) })
small_a549_colData_df <- readRDS(system.file("extdata",
"small_a549_dex_pdata.rda",
package = "monocle3"))
small_a549_rowData_df <- readRDS(system.file("extdata",
"small_a549_dex_fdata.rda",
package = "monocle3"))
small_a549_exprs <- readRDS(system.file("extdata",
"small_a549_dex_exprs.rda",
package = "monocle3"))
small_a549_exprs <- small_a549_exprs[,row.names(small_a549_colData_df)]
expression_matrix <- set_matrix_class(mat=small_a549_exprs, matrix_control=matrix_control_res)
cds <- new_cell_data_set(expression_data = expression_matrix,
cell_metadata = small_a549_colData_df,
gene_metadata = small_a549_rowData_df)
cds <- set_matrix_citation(cds)
cds
}
#' Build a cell_data_set from C. elegans embryo data.
#' @param matrix_control A list used to control how the counts matrix is stored
#' in the CDS. By default, Monocle3 stores the counts matrix in-memory as a
#' sparse matrix. Setting 'matrix_control=list(matrix_class="BPCells")',
#' stores the matrix on-disk as a sparse matrix.
#' @return cds object
#' @importFrom SingleCellExperiment counts
#' @export
load_worm_embryo <- function(matrix_control=list()) {
matrix_control_res <- tryCatch(set_matrix_control(matrix_control=matrix_control, matrix_control_default=list(), control_type='unrestricted'),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_worm_embryo')) })
expression_matrix <- readRDS(url("https://depts.washington.edu:/trapnell-lab/software/monocle3/celegans/data/packer_embryo_expression.rds"))
cell_metadata <- readRDS(url("https://depts.washington.edu:/trapnell-lab/software/monocle3/celegans/data/packer_embryo_colData.rds"))
gene_annotation <- readRDS(url("https://depts.washington.edu:/trapnell-lab/software/monocle3/celegans/data/packer_embryo_rowData.rds"))
gene_annotation$use_for_ordering <- NULL
expression_matrix <- set_matrix_class(mat=expression_matrix, matrix_control=matrix_control_res)
cds <- new_cell_data_set(expression_matrix,
cell_metadata = cell_metadata,
gene_metadata = gene_annotation)
cds <- set_matrix_citation(cds)
cds <- estimate_size_factors(cds)
cds <- initialize_counts_metadata(cds)
matrix_id <- get_unique_id(counts(cds))
cds <- set_counts_identity(cds, 'URL: https://depts.washington.edu:/trapnell-lab/software/monocle3/celegans/data/packer_embryo_expression.rds', matrix_id)
cds
}
#' Build a cell_data_set from C. elegans L2 data.
#' @param matrix_control A list used to control how the counts matrix is stored
#' in the CDS. By default, Monocle3 stores the counts matrix in-memory as a
#' sparse matrix. Setting 'matrix_control=list(matrix_class="BPCells")',
#' stores the matrix on-disk as a sparse matrix.
#' @return cds object
#' @importFrom SingleCellExperiment counts
#' @export
load_worm_l2 <- function(matrix_control=list()) {
matrix_control_res <- tryCatch(set_matrix_control(matrix_control=matrix_control, matrix_control_default=list(), control_type='unrestricted'),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_worm_l2')) })
expression_matrix <- readRDS(url("https://depts.washington.edu:/trapnell-lab/software/monocle3/celegans/data/cao_l2_expression.rds"))
cell_metadata <- readRDS(url("https://depts.washington.edu:/trapnell-lab/software/monocle3/celegans/data/cao_l2_colData.rds"))
gene_annotation <- readRDS(url("https://depts.washington.edu:/trapnell-lab/software/monocle3/celegans/data/cao_l2_rowData.rds"))
expression_matrix <- set_matrix_class(mat=expression_matrix, matrix_control=matrix_control_res)
cds <- new_cell_data_set(expression_matrix,
cell_metadata = cell_metadata,
gene_metadata = gene_annotation)
cds <- set_matrix_citation(cds)
cds <- estimate_size_factors(cds)
cds <- initialize_counts_metadata(cds)
matrix_id <- get_unique_id(counts(cds))
cds <- set_counts_identity(cds, 'URL: https://depts.washington.edu:/trapnell-lab/software/monocle3/celegans/data/cao_l2_expression.rds', matrix_id)
cds
}
#' Test if a file has a Matrix Market header.
#' @param matpath Path to test file.
#' @return TRUE if matpath file has Matrix Market header.
#' @noRd
is_matrix_market_file <- function( matpath )
{
first_line <- utils::read.table( matpath, nrows=1 )
grepl( "%%MatrixMarket", first_line$V1 )
}
#' Load matrix dimension metadata from file.
#' @param anno_path Path to matdim annotation file.
#' @return list consisting of matdim_names, which label matrix dimension,
#' and metadata, if present in the file, which are
#' additional dimension metadata.
#' @noRd
load_annotations_data <- function( anno_path, metadata_column_names=NULL, header=FALSE, sep="", quote="\"'", annotation_type=NULL )
{
assertthat::assert_that( ! is.null( annotation_type ) )
annotations <- tryCatch(utils::read.table( anno_path, header=header, sep=sep, quote=quote, stringsAsFactors=FALSE ),
error = function(c) { stop(paste0(trimws(c),
'\n unable to read ', annotation_type, ' file ', anno_path,
'\n note: possible problems include the wrong filename, a missing file,',
'\n and incorrect file format parameters, for example \'header\', \'sep\', and \'quote\'',
'\n', dbar40,
'\n', report_path_status(anno_path, dirname(anno_path)),
'\n', dbar40,
'\n* error in load_annotations_data')) })
metadata = NULL
if( .row_names_info( annotations ) < 0 )
{
names <- annotations[,1]
if( ncol( annotations ) > 1 )
metadata <- annotations[,-1,drop=FALSE]
}
else
{
names <- rownames( annotations )
metadata <- annotations
}
if( ! ( is.null( metadata_column_names ) || is.null( metadata ) ) )
{
assertthat::assert_that( length( metadata_column_names ) == ncol( metadata ),
msg=paste0( annotation_type,
' metadata column name count (',
length( metadata_column_names ),
') != ',
annotation_type,
'annotation column count (',
ncol( metadata ),
')' ) )
colnames( metadata ) <- metadata_column_names
}
if( ! is.null( metadata ) )
rownames(metadata) <- names
list( names=names, metadata=metadata )
}
#' Load data from matrix market format files.
#'
#' @param mat_path Path to the Matrix Market .mtx matrix file. The
#' values are read and stored as a sparse matrix with nrows and ncols,
#' as inferred from the file. Required.
#' @param feature_anno_path Path to a feature annotation file. The
#' feature_anno_path file must have nrows lines and at least one column.
#' The values in the first column label the matrix rows and each must be
#' distinct in the column. Values in additional columns are stored in
#' the cell_data_set 'gene' metadata. For gene features, we urge use of
#' official gene IDs for labels, such as Ensembl or Wormbase IDs. In this
#' case, the second column has typically a 'short' gene name. Additional
#' information such as gene_biotype may be stored in additional columns
#' starting with column 3. Required.
#' @param cell_anno_path Path to a cell annotation file. The cell_anno_path
#' file must have ncols lines and at least one column. The values in the
#' first column label the matrix columns and each must be distinct in the
#' column. Values in additional columns are stored in the cell_data_set
#' cells metadata. Required.
#' @param header Logical set to TRUE if both feature_anno_path and
#' cell_anno_path files have column headers, or set to FALSE if both
#' files do not have column headers (only these cases are supported).
#' The files may have either ncols or ncols-1 header fields. In both
#' cases, the first column is used as the matrix dimension names. The
#' default is FALSE.
#' @param feature_metadata_column_names A character vector of feature
#' metadata column names. The number of names must be one less than the
#' number of columns in the feature_anno_path file. These values
#' will replace those read from the feature_anno_path file header,
#' if present. The default is NULL.
#' @param cell_metadata_column_names A character vector of cell
#' metadata column names. The number of names must be one less than the
#' number of columns in the cell_anno_path file. These values will
#' replace those read from the cell_anno_path file header, if present.
#' The default is NULL.
#' @param quote A character string specifying the quoting characters
#' used in the feature_anno_path and cell_anno_path files. The default
#' is "\"'".
#' @param umi_cutoff UMI per cell cutoff. Columns (cells) with less
#' than umi_cutoff total counts are removed from the matrix. The
#' default is 100.
#' @param sep field separator character in the annotation files. If
#' sep = "", the separator is white space, that is, one or more spaces,
#' tabs, newlines, or carriage returns. The default is the tab
#' character for tab-separated-value files.
#' @param verbose a logical value that determines whether or not the
#' function writes diagnostic information.
#' @param matrix_control an optional list of values that control how
#' matrices are stored in the cell_data_set assays slot. Typically,
#' matrices are stored in-memory as dgCMatrix class (compressed sparse
#' matrix) objects using matrix_class="dgCMatrix". This is the
#' default. A very large matrix can be stored in a file and accessed
#' by Monocle3 as if it were in-memory. For this, Monocle3 uses the
#' BPCells R package. Here the matrix_control list values are set to
#' matrix_class="BPCells" and matrix_mode="dir". Then the counts matrix
#' is stored in a directory, on-disk, which is created by Monocle3 in
#' the directory where you run Monocle3. This directory has a name
#' with the form "monocle.bpcells.*.tmp" where the asterisk is a
#' string of random characters that makes the name unique. Do not
#' remove this directory while Monocle3 is running! If you choose to
#' store the counts matrix as an on-disk BPCells object, you must use
#' the "save_monocle_objects" and "load_monocle_objects" functions
#' to save and restore the cell_data_set. Monocle3 tries to remove
#' the BPCells matrix directory when your R session ends; however,
#' sometimes a matrix directory may persist after the session ends.
#' In this case, the user must remove the directory after the
#' session ends. For additional information about the matrix_control
#' list, see the examples below and the set_matrix_control help.
#' Note that for the load_mm_data function the BPCells matrix_mode
#' is "dir", the matrix_type is "double", and the matrix_compress is
#' FALSE.
#' @return cds object
#'
#' @section Comments:
#' * load_mm_data estimates size factors.
#'
#' @examples
#' \donttest{
#' pmat<-system.file("extdata", "matrix.mtx.gz", package = "monocle3")
#' prow<-system.file("extdata", "features_c3h0.txt", package = "monocle3")
#' pcol<-system.file("extdata", "barcodes_c2h0.txt", package = "monocle3")
#' cds <- load_mm_data( pmat, prow, pcol,
#' feature_metadata_column_names =
#' c('gene_short_name', 'gene_biotype'), sep='' )
#'
#' # In this example, the features_c3h0.txt file has three columns,
#' # separated by spaces. The first column has official gene names, the
#' # second has short gene names, and the third has gene biotypes.
#' #
#' # For typical count matrices with a small to medium number of cells,
#' # we suggest that you use the default matrix_control list by not
#' # not setting the matrix_control parameter. In this case, the
#' # counts matrix is stored in-memory as a sparse matrix in the
#' # dgCMatrix format, as it has in the past. It is also possible to
#' # set the matrix_control list explicitly to use this in-memory
#' # dgCMatrix format by setting the matrix_control parameter to
#' #
#' load_mm_data(..., matrix_control=list(matrix_class='dgCMatrix'))
#' #
#' # For large matrices, we suggest that you try storing the count
#' # matrix as a BPCells object on-disk by setting the matrix_control
#' # parameter list as follows
#' #
#' load_mm_data(..., matrix_control=list(matrix_class='BPCells'))
#' #
#' }
#'
#' @importFrom SingleCellExperiment counts
#' @export
load_mm_data <- function( mat_path,
feature_anno_path,
cell_anno_path,
header = FALSE,
feature_metadata_column_names = NULL,
cell_metadata_column_names = NULL,
umi_cutoff = 100,
quote="\"'",
sep="\t",
verbose=FALSE,
matrix_control=list()) {
assertthat::assert_that(assertthat::is.readable(mat_path), msg='unable to read matrix file')
assertthat::assert_that(assertthat::is.readable(feature_anno_path), msg='unable to read feature annotation file')
assertthat::assert_that(assertthat::is.readable(cell_anno_path), msg='unable to read cell annotation file')
assertthat::assert_that(is.numeric(umi_cutoff))
matrix_control_res <- tryCatch(set_matrix_control(matrix_control=matrix_control, matrix_control_default=list(), control_type='unrestricted'),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_mm_data')) })
feature_annotations <- tryCatch(load_annotations_data( feature_anno_path, feature_metadata_column_names, header, sep, quote=quote, annotation_type='features' ),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_mm_data')) })
cell_annotations <- tryCatch(load_annotations_data( cell_anno_path, cell_metadata_column_names, header, sep, quote=quote, annotation_type='cells' ),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_mm_data')) })
assertthat::assert_that( ! any( duplicated( feature_annotations$names ) ), msg='duplicate feature names in feature annotation file' )
assertthat::assert_that( ! any( duplicated( cell_annotations$names ) ), msg='duplicate cell names in cell annotation file' )
if(matrix_control_res[['matrix_class']] != 'BPCells') {
# Read MatrixMarket file and convert to dgCMatrix format.
mat <- Matrix::readMM(mat_path)
mat <- set_matrix_class(mat=mat, matrix_control=matrix_control_res)
assertthat::assert_that( length( feature_annotations$names ) == nrow( mat ),
msg=paste0( 'feature name count (',
length( feature_annotations$names ),
') != matrix row count (',
nrow( mat ),
')' ) )
assertthat::assert_that( length( cell_annotations$names ) == ncol( mat ),
msg=paste0( 'cell name count (',
length( cell_annotations$names ),
') != matrix column count (',
ncol( mat ),
')' ) )
rownames( mat ) <- feature_annotations$names
colnames( mat ) <- cell_annotations$names
}
else {
toutdir <- tempfile(pattern=paste0('monocle.bpcells.',
format(Sys.Date(), format='%Y%m%d'), '.'),
tmpdir=matrix_control_res[['matrix_path']],
fileext='.tmp')[[1]]
tmpdir <- tempfile('monocle.import_mm.', '.', '.tmp')
tmat <- tryCatch(BPCells::import_matrix_market(mtx_path=mat_path,
outdir=toutdir,
row_names=feature_annotations$names,
col_names=cell_annotations$names,
row_major=FALSE,
tmpdir=tmpdir,
load_bytes=4194304L,
sort_bytes=1073741824L),
error = function(c) { stop(paste0(trimws(c),
'\n unable to read file ', mat_path,
'\n', dbar40,
'\n', report_path_status(mat_path, dirname(mat_path)),
'\n', dbar40,
'\n* error in load_mm_data')) })
unlink(tmpdir, recursive=TRUE)
outdir_c <- tempfile(pattern=paste0('monocle.bpcells.',
format(Sys.Date(), format='%Y%m%d'), '.'),
tmpdir=matrix_control_res[['matrix_path']],
fileext='.tmp')[[1]]
mat <- tryCatch(BPCells::write_matrix_dir(BPCells::convert_matrix_type(tmat, 'double'), outdir_c, compress=FALSE, buffer_size=8192L, overwrite=FALSE),
error = function(c) { stop(paste0(trimws(c),
'\n error make row order counts matrix',
'\n', dbar40,
'\n', report_path_status(dirname(outdir_c), dirname(tmat)),
'\n', dbar40,
'\n* error in load_mm_data')) })
unlink(toutdir, recursive=TRUE)
push_matrix_path(mat=mat)
}
cds <- new_cell_data_set(mat,
cell_metadata = cell_annotations$metadata,
gene_metadata = feature_annotations$metadata,
verbose = verbose)
cds <- set_matrix_citation(cds)
if(is(counts(cds), 'CsparseMatrix')) {
colData(cds)$n.umi <- Matrix::colSums(counts(cds))
}
else
if(is(counts(cds), 'IterableMatrix')) {
colData(cds)$n.umi <- BPCells::colSums(counts(cds))
}
cds <- cds[,colData(cds)$n.umi >= umi_cutoff]
cds <- estimate_size_factors(cds)
cds <- initialize_counts_metadata(cds)
matrix_id <- get_unique_id(counts(cds))
cds <- set_counts_identity(cds, mat_path, matrix_id)
if(verbose) {
message('load_mm_data: matrix_info: out:')
show_matrix_info(matrix_info=get_matrix_info(mat=mat), ' ')
}
return(cds)
}
#' Load data from matrix market format
#'
#' @param mat_path Path to the .mtx matrix market file.
#' @param gene_anno_path Path to gene annotation file.
#' @param cell_anno_path Path to cell annotation file.
#' @param umi_cutoff UMI per cell cutoff, default is 100.
#' @param matrix_control A list used to control how the counts matrix is stored
#' in the CDS. By default, Monocle3 stores the counts matrix in-memory as a
#' sparse matrix. Setting 'matrix_control=list(matrix_class="BPCells")',
#' stores the matrix BPCells on-disk as a sparse matrix.
#'
#' @return cds object
#' @importFrom SingleCellExperiment counts
#'
#' @examples
#' \donttest{
#' pmat<-system.file("extdata", "matrix.mtx.gz", package = "monocle3")
#' prow<-system.file("extdata", "features_c3h0.txt", package = "monocle3")
#' pcol<-system.file("extdata", "barcodes_c2h0.txt", package = "monocle3")
#' cds <- load_mtx_data( pmat, prow, pcol)
#' }
#'
#' @export
load_mtx_data <- function( mat_path,
gene_anno_path,
cell_anno_path,
umi_cutoff = 100,
matrix_control=list()) {
assertthat::assert_that(assertthat::is.readable(mat_path))
assertthat::assert_that(assertthat::is.readable(gene_anno_path))
assertthat::assert_that(assertthat::is.readable(cell_anno_path))
assertthat::assert_that(is.numeric(umi_cutoff))
if( is_matrix_market_file( mat_path ) )
{
#
# Read an feature annotation file with two tab-separated
# columns where the second column has short gene names.
# Read a cell annotation file with one column that has the
# matrix row names.
#
cds <- tryCatch(load_mm_data( mat_path,
gene_anno_path,
cell_anno_path,
feature_metadata_column_names=c('gene_short_name'),
umi_cutoff=umi_cutoff,
sep="\t",
verbose=FALSE,
matrix_control=matrix_control),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_mtx_data')) })
return( cds )
}
matrix_control_res <- tryCatch(set_matrix_control(matrix_control=matrix_control, matrix_control_default=list(), control_type='unrestricted'),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_mtx_data')) })
df <- utils::read.table(mat_path, col.names = c("gene.idx", "cell.idx", "count"),
colClasses = c("integer", "integer", "integer"))
gene.annotations <- utils::read.table(gene_anno_path,
col.names = c("id", "gene_short_name"),
colClasses = c("character", "character"))
cell.annotations <- utils::read.table(cell_anno_path, col.names = c("cell"),
colClasses = c("character"))
rownames(gene.annotations) <- gene.annotations$id
rownames(cell.annotations) <- cell.annotations$cell
# add a dummy cell to ensure that all genes are included in the matrix
# even if a gene isn't expressed in any cell
df <- rbind(df, data.frame(gene.idx = c(1, nrow(gene.annotations)),
cell.idx = rep(nrow(cell.annotations) + 1, 2),
count = c(1, 1)))
mat <- Matrix::sparseMatrix(i = df$gene.idx, j = df$cell.idx, x = df$count)
if(ncol(mat) == 1) {
mat <- mat[,0, drop=FALSE]
}
else {
if(ncol(mat) < 2) warning('bad loop: ncol(mat) < 2')
mat <- mat[, 1:(ncol(mat)-1), drop=FALSE]
}
mat <- set_matrix_class(mat=mat, matrix_control=matrix_control_res)
rownames(mat) <- gene.annotations$id
colnames(mat) <- cell.annotations$cell
cds <- new_cell_data_set(mat, cell_metadata = cell.annotations,
gene_metadata = gene.annotations)
cds <- set_matrix_citation(cds)
colData(cds)$n.umi <- Matrix::colSums(counts(cds))
cds <- cds[,colData(cds)$n.umi >= umi_cutoff]
cds <- estimate_size_factors(cds)
cds <- initialize_counts_metadata(cds)
matrix_id <- get_unique_id(counts(cds))
cds <- set_counts_identity(cds, mat_path, matrix_id)
return(cds)
}
update_annoy_index <- function(annoy) {
if(!is.null(annoy[['nn_index']][['version']]) && annoy[['nn_index']][['version']] == 2) {
annoy_out <- annoy
}
else
if(!is.null(annoy[['nn_index']][['version']]) && annoy[['nn_index']][['version']] == 1) {
annoy_out <- S4Vectors::SimpleList()
annoy_out[['nn_index']] <- list()
annoy_out[['nn_index']][['method']] <- 'annoy'
annoy_out[['nn_index']][['annoy_index']] <- annoy[['nn_index']][['annoy_index']]
annoy_out[['nn_index']][['version']] <- 2
annoy_out[['nn_index']][['annoy_index_version']] <- annoy[['index_version']]
annoy_out[['nn_index']][['metric']] <- annoy[['metric']]
annoy_out[['nn_index']][['n_trees']] <- annoy[['n_trees']]
annoy_out[['nn_index']][['nrow']] <- annoy[['nrow']]
annoy_out[['nn_index']][['ncol']] <- annoy[['ncol']]
annoy_out[['nn_index']][['checksum_rownames']] <- NA_character_
annoy_out[['matrix_id']] <- annoy[['matrix_id']]
annoy_out[['updated']] <- TRUE
}
else
if(!is.null(annoy[['nn_index']][['type']]) && annoy[['nn_index']][['type']] == 'annoyv1') {
annoy_out <- S4Vectors::SimpleList()
annoy_out[['nn_index']] <- list()
annoy_out[['nn_index']][['method']] <- 'annoy'
annoy_out[['nn_index']][['annoy_index']] <- annoy[['nn_index']][['ann']]
annoy_out[['nn_index']][['version']] <- 2
annoy_out[['nn_index']][['annoy_index_version']] <- annoy[['index_version']]
annoy_out[['nn_index']][['metric']] <- annoy[['metric']]
annoy_out[['nn_index']][['n_trees']] <- annoy[['n_trees']]
annoy_out[['nn_index']][['nrow']] <- annoy[['nrow']]
annoy_out[['nn_index']][['ncol']] <- annoy[['ncol']]
annoy_out[['nn_index']][['checksum_rownames']] <- NA_character_
annoy_out[['matrix_id']] <- annoy[['matrix_id']]
annoy_out[['updated']] <- TRUE
}
else
if(is.null(annoy[['nn_index']][['type']])) {
annoy_out <- S4Vectors::SimpleList()
annoy_out[['nn_index']] <- list()
annoy_out[['nn_index']][['method']] <- 'annoy'
annoy_out[['nn_index']][['annoy_index']] <- annoy[['nn_index']]
annoy_out[['nn_index']][['version']] <- 2
annoy_out[['nn_index']][['annoy_index_version']] <- NA_character_
annoy_out[['nn_index']][['metric']] <- annoy[['metric']]
annoy_out[['nn_index']][['metric']] <- ifelse(!is.null(annoy[['metric']]), annoy[['metric']], NA_character_)
annoy_out[['nn_index']][['n_trees']] <- ifelse(!is.null(annoy[['n_trees']]), annoy[['n_trees']], NA_integer_)
annoy_out[['nn_index']][['nrow']] <- ifelse(!is.null(annoy[['n_trees']]), annoy[['n_trees']], NA_integer_)
annoy_out[['nn_index']][['ncol']] <- ifelse(!is.null(annoy[['n_trees']]), annoy[['n_trees']], NA_integer_)
annoy_out[['nn_index']][['checksum_rownames']] <- NA_character_
annoy_out[['matrix_id']] <- NA_character_
annoy_out[['updated']] <- TRUE
}
return(annoy_out)
}
update_hnsw_index <- function(hnsw) {
if(!is.null(hnsw[['nn_index']][['version']]) && hnsw[['nn_index']][['version']] == 1) {
hnsw_out <- hnsw
}
else
if(!is.null(hnsw[['nn_index']][['version']]) && hnsw[['nn_index']][['version']] == 1) {
hnsw_out <- S4Vectors::SimpleList()
annoy_out[['nn_index']] <- list()
hnsw_out[['nn_index']][['method']] <- 'hnsw'
hnsw_out[['nn_index']][['hnsw_index']] <- hnsw[['nn_index']]
hnsw_out[['nn_index']][['version']] <- 1
hnsw_out[['nn_index']][['hnsw_index_version']] <- hnsw[['index_version']]
hnsw_out[['nn_index']][['metric']] <- hnsw[['metric']]
hnsw_out[['nn_index']][['M']] <- hnsw[['M']]
hnsw_out[['nn_index']][['ef_construction']] <- hnsw[['ef_construction']]
hnsw_out[['nn_index']][['nrow']] <- hnsw[['nrow']]
hnsw_out[['nn_index']][['ncol']] <- hnsw[['ncol']]
hnsw_out[['nn_index']][['checksum_rownames']] <- NA_character_
hnsw_out[['matrix_id']] <- hnsw[['matrix_id']]
annoy_out[['updated']] <- TRUE
}
return(hnsw_out)
}
# Save Annoy model files.
# Complexities
# o uwot annoy model object is declared to be subject to change
# o uwot annoy object changes some time between releases
# 0.1.8 and 0.1.10: the annoy index moved from
# object$nn_index to object$nn_index$ann
# o the uwot annoy object may have more than one metric
# o see uwot/R/uwot.R functions save_uwot and load_uwot
# Notes
# o uwot 0.1.10 has nn_index$type='annoyv1', which may be a
# umap annoy object version number. It does not exist for
# uwot 0.1.8. nn_index$ann does not exist either.
# o the nn_index parameter is the nn_index object in the
# uwot umap model and returned by uwot::annoy_build()
# uwot v0.1.8::annoy_build() returns
# ann <- uwot::create_ann()
# ...
# return ann
# uwot v0.1.10::annoy_build() returns
# annoy <- annoy_create(metric, nc)
# ...
# ann <- annoy$ann
# ...
# for (i in 1:nr) {
# ann$addItem(i - 1, X[i, ])
# }
# ...
# return annoy
# where annoy is assigned to nn_index in either the
# UMAP model or by uwot::annoy_build().
# untested code when is.null(nn_index[['type']])
save_annoy_index <- function(nn_index, file_name) {
file_name <- normalizePath(file_name, mustWork=FALSE)
if(!is.null(nn_index[['version']])) {
if(nn_index[['version']] == 1 || nn_index[['version']] == 2) {
tryCatch( nn_index[['annoy_index']]$save(file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error saving file ', file_name,
'\n', dbar40,
'\n', report_path_status(dirname(file_name)),
'\n', dbar40,
'\n* error in save_annoy_index')) })
}
else {
stop('Unrecognized Monocle3 annoy index type')
}
}
else
if(!is.null(nn_index[['type']])) {
if(nn_index[['type']] == 'annoyv1') {
tryCatch( nn_index[['ann']]$save(file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error saving file ', file_name,
'\n', dbar40,
'\n', report_path_status(dirname(file_name)),
'\n', dbar40,
'\n* error in save_annoy_index')) })
}
else {
stop('Unrecognized uwot annoy index type')
}
}
else {
nn_index$save(file_name)
}
}
# see comments for save_annoy_index
# modified for RcppAnnoy
load_annoy_index <- function(nn_index, file_name, metric, ndim) {
file_name <- normalizePath(file_name, mustWork=FALSE)
if(!is.null(nn_index[['version']])) {
if(nn_index[['version']] == 1 || nn_index[['version']] == 2) {
annoy_index <- tryCatch(new_annoy_index(metric, ndim),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_annoy_index')) })
tryCatch(
annoy_index$load(file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_name,
'\n', dbar40,
'\n', report_path_status(file_name, dirname(file_name)),
'\n', dbar40,
'\n* error in load_annoy_index')) })
nn_index[['annoy_index']] <- annoy_index
}
else {
stop('Unrecognized annoy index type')
}
}
else
if(!is.null(nn_index[['type']])) {
if(nn_index[['type']] == 'annoyv1') {
annoy_index <- tryCatch(new_annoy_index(metric, ndim),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_annoy_index')) })
tryCatch(
annoy_index$load(file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_name,
'\n', dbar40,
'\n', report_path_status(file_name, dirname(file_name)),
'\n', dbar40,
'\n* error in load_annoy_index')) })
nn_index[['ann']] <- annoy_index
}
else {
stop('Unrecognized annoy index type')
}
}
else {
# Assume to be an older uwot annoy index version.
nn_index <- tryCatch(new_annoy_index(metric, ndim),
error = function(c) {stop(paste0(trimws(c), '\n* error in load_annoy_index')) })
nn_index$load(file_name)
}
return(nn_index)
}
save_umap_annoy_index <- function(nn_index, file_name) {
file_name <- normalizePath(file_name, mustWork=FALSE)
if(!is.null(nn_index[['type']])) {
if(nn_index[['type']] == 'annoyv1') {
tryCatch( nn_index[['ann']]$save(file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error saving annoy index file ', file_name,
'\n', dbar40,
'\n', report_path_status(dirname(file_name)),
'\n', dbar40,
'\n* error in load_umap_annoy_index')) })
}
else {
stop('Unrecognized umap annoy index type')
}
}
else {
nn_index$save(file_name)
}
}
load_umap_annoy_index <- function(nn_index, file_name, metric, ndim) {
file_name <- normalizePath(file_name, mustWork=FALSE)
if(!is.null(nn_index[['type']])) {
if(nn_index[['type']] == 'annoyv1') {
annoy_index <- tryCatch(new_annoy_index(metric, ndim),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_umap_annoy_index')) })
tryCatch(annoy_index$load(file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_name,
'\n', dbar40,
'\n', report_path_status(file_name, dirname(file_name)),
'\n', dbar40,
'\n* error in load_umap_annoy_index')) })
nn_index[['ann']] <- annoy_index
}
else {
stop('Unrecognized umap annoy index type')
}
}
else {
# Assume to be an older uwot annoy index version.
nn_index <- tryCatch(new_annoy_index(metric, ndim),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_umap_annoy_index')) })
tryCatch(
nn_index$load(file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_name,
'\n', dbar40,
'\n', report_path_status(file_name, dirname(file_name)),
'\n', dbar40,
'\n* error in load_umap_annoy_index')) })
}
return(nn_index)
}
save_hnsw_index <- function(nn_index, file_name) {
file_name <- normalizePath(file_name, mustWork=FALSE)
if(is.null(nn_index)) return()
if(!is.null(nn_index[['version']])) {
out_index <- nn_index[['hnsw_index']]
}
else {
out_index <- nn_index
}
tryCatch(out_index$save(file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error writing file ', file_name,
'\n', dbar40,
'\n', report_path_status(dirname(file_name)),
'\n', dbar40,
'\n* error in save_hnsw_index')) })
}
load_hnsw_index <- function(nn_index, file_name, metric, ndim) {
file_name <- normalizePath(file_name, mustWork=FALSE)
if(metric == 'l2') {
new_index <- tryCatch(
methods::new(RcppHNSW::HnswL2, ndim, file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_name,
'\n', dbar40,
'\n', report_path_status(file_name, dirname(file_name)),
'\n', dbar40,
'\n* error in save_hnsw_index')) })
}
else
if(metric == 'euclidean') {
new_index <- tryCatch(
methods::new(RcppHNSW::HnswL2, ndim, file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_name,
'\n', dbar40,
'\n', report_path_status(file_name, dirname(file_name)),
'\n', dbar40,
'\n* error in save_hnsw_index')) })
attr(new_index, "distance") <- "euclidean"
}
else
if(metric == 'cosine') {
new_index <- tryCatch(
methods::new(RcppHNSW::HnswCosine, ndim, file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_name,
'\n', dbar40,
'\n', report_path_status(file_name, dirname(file_name)),
'\n', dbar40,
'\n* error in save_hnsw_index')) })
}
else
if(metric == 'ip') {
new_index <- tryCatch(
methods::new(RcppHNSW::HnswIp, ndim, file_name),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_name,
'\n', dbar40,
'\n', report_path_status(file_name, dirname(file_name)),
'\n', dbar40,
'\n* error in save_hnsw_index')) })
}
else
stop('Unrecognized HNSW metric ', metric)
if(!is.null(nn_index[['version']]))
nn_index[['hnsw_index']] <- new_index
else
nn_index <- new_index
return(nn_index)
}
# Save umap annoy indices to files and return md5sum
# value(s) as either a character string, in case of
# one metric, or a list, in case of more than one matric.
save_umap_nn_indexes <- function(umap_model, file_name) {
file_name <- normalizePath(file_name, mustWork=FALSE)
if(is.null(umap_model[['metric']])) {
stop(paste0('\n No UMAP model in this CDS',
'\n* error in save_umap_nn_indexes'))
}
metrics <- names(umap_model[['metric']])
n_metrics <- length(metrics)
if(n_metrics == 1) {
tryCatch(save_umap_annoy_index(umap_model[['nn_index']], file_name),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_umap_nn_indexes')) })
md5sum_umap_index <- tools::md5sum(file_name)
if(is.na(md5sum_umap_index)) {
stop(paste0('\n unable to read file for checksum: ', file_name,
'\n', dbar40,
'\n', report_path_status(file_name, dirname(file_name)),
'\n', dbar40,
'\n* error in save_umap_nn_indexes'))
}
}
else {
warning('save_umap_nn_indexes is untested with more than one umap metric')
md5sum_vec <- character()
for(i in seq(1, n_metrics, 1)) {
file_name_expand <- paste0(file_name, i)
tryCatch(save_umap_annoy_index(umap_model[['nn_index']][[i]], file_name_expand),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_umap_nn_indexes')) })
md5sum <- tools::md5sum(file_name_expand)
if(is.na(md5sum)) {
stop(paste0('\n unable to read file for checksum: ', file_name_expand,
'\n', dbar40,
'\n', report_path_status(file_name_expand, dirname(file_name_expand)),
'\n', dbar40,
'\n* error in save_umap_nn_indexes'))
}
append(md5sum_vec, md5sum)
}
md5sum_umap_index <- paste(md5sum_vec, collapse='_')
}
return(md5sum_umap_index)
}
# Load umap annoy indices into umap_model and return umap_model.
load_umap_nn_indexes <- function(umap_model, file_name, md5sum_umap_index) {
file_name <- normalizePath(file_name, mustWork=FALSE)
metrics <- names(umap_model[['metric']])
n_metrics <- length(metrics)
if(n_metrics == 1) {
md5sum <- tools::md5sum(file_name)
if(is.na(md5sum)) {
stop(paste0('\n unable to read file for checksum: ', file_name,
'\n', dbar40,
'\n', report_path_status(file_name, dirname(file_name)),
'\n', dbar40,
'\n* error in load_umap_nn_indexes'))
}
# Don't check the md5sum when md5sum_umap_index is NA in file_index.rds in order to let the user circumvent the test.
if(!is.na(md5sum_umap_index) && !is.null(md5sum_umap_index) && md5sum != md5sum_umap_index) {
stop(report_checksum_difference('load_umap_nn_indexes', file_name, md5sum_umap_index, md5sum))
}
metric <- metrics[[1]]
annoy_metric <- if(metric == 'correlation') 'cosine' else metric
annoy_ndim <- umap_model[['metric']][[1]][['ndim']]
umap_model[['nn_index']] <- tryCatch(load_umap_annoy_index(umap_model[['nn_index']], file_name, annoy_metric, annoy_ndim),
error = function(c) {stop(paste0(trimws(c), '\n* error in load_umap_nn_indexes')) })
}
else {
warning('load_umap_nn_indexes is untested with more than one umap metric')
if(!is.null(md5sum_umap_index)) {
md5sum_vec <- unlist(strsplit(md5sum_umap_index, '_', fixed=TRUE))
}
for(i in seq(1, n_metrics, 1)) {
file_name_expand <- paste0(file_name, i)
md5sum <- tools::md5sum(file_name_expand)
if(is.na(md5sum)) {
stop(paste0('\n unable to read file for checksum: ', file_name_expand,
'\n', dbar40,
'\n', report_path_status(file_name_expand, dirname(file_name_expand)),
'\n', dbar40,
'\n* error in load_umap_nn_indexes'))
}
# Don't check the md5sum when md5sum_umap_index is NA in file_index.rds in order to let the user circumvent the test.
if(!is.na(md5sum_vec[[i]]) && !is.null(md5sum_vec[[i]]) && md5sum != md5sum_vec[[i]]) {
stop(report_checksum_difference('load_umap_nn_indexes', file_name_expand, md5sum_vec[[i]], md5sum))
}
metric <- metrics[[i]]
annoy_metric <- if(metric == 'correlation') 'cosine' else metric
annoy_ndim <- length(umap_model[['metric']][[i]])
umap_model[['nn_index']][[i]] <- tryCatch(load_umap_annoy_index(umap_model[['nn_index']][[i]], file_name_expand, annoy_metric, annoy_ndim),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_umap_nn_indexes')) })
}
}
return(umap_model)
}
# This is a specialized function for use in load_monocle_objects. There are
# no matrix_control checks and it requires the path to an existing
# BPCells matrix stored in a directory. The matrix control is used only
# to set the resulting matrix_path.
load_bpcells_matrix_dir <- function(file_name, md5sum, matrix_control=list()) {
file_name <- normalizePath(file_name, mustWork=FALSE)
md5sum_file <- tryCatch(bpcells_matdir_md5(file_name),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_bpcells_matrix_dir')) })
# Don't check the md5sum when md5sum_file is NA in file_index.rds in order to let the user circumvent the test.
if(!is.na(md5sum) && md5sum_file != md5sum) {
stop(report_checksum_difference('load_bpcells_matrix_dir', basename(file_name), md5sum, md5sum_file))
}
matrixDirTmp <- tryCatch(BPCells::open_matrix_dir(dir=file_name, buffer_size=8192L),
error=function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_name,
'\n', dbar40,
'\n', report_path_status(file_name, dirname(file_name)),
'\n', dbar40,
'\n* error in load_bpcells_matrix_dir'))
},
warning=function(c) {
msg <- conditionMessage(c)
message('load_bpcells_matrix_dir: warning reading file: ', file_name, ': ', msg)
}
)
matrix_info <- get_matrix_info(mat=matrixDirTmp)
matrix_control_res <- list(matrix_class=matrix_info[['matrix_class']],
matrix_mode=matrix_info[['matrix_mode']],
matrix_type=matrix_info[['matrix_type']],
matrix_compress=matrix_info[['matrix_compress']],
matrix_path=matrix_control[['matrix_path']],
matrix_buffer_size=matrix_info[['matrix_buffer_size']],
matrix_bpcells_copy=TRUE)
matrixDir <- set_matrix_class(mat=matrixDirTmp, matrix_control=matrix_control_res)
return(matrixDir)
}
#
# Report files saved.
#
report_files_saved <- function(file_index) {
appendLF <- TRUE
files <- file_index[['files']]
message('Info: save_monocle_objects: saving monocle object files:')
for( i in seq_along(files[['cds_object']])) {
cds_object <- files[['cds_object']][[i]]
reduction_method <- files[['reduction_method']][[i]]
if(cds_object == 'cds') {
process <- 'cell_data_set'
reduction_method <- 'full_cds'
}
else
if(cds_object == 'reduce_dim_aux') {
if(reduction_method == 'Aligned') {
process <- 'align_cds'
}
else
if(reduction_method == 'PCA' || reduction_method == 'LSI') {
process <- 'preprocess_cds'
}
else
if(reduction_method == 'tSNE' || reduction_method == 'UMAP') {
process <- 'reduce_dimension'
}
else {
stop('Unrecognized preprocess reduction_method \'', reduction_method, '\'')
}
}
else
if(cds_object == 'bpcells_matrix_dir') {
process <- 'BPCells MatrixDir'
reduction_method <- 'full_counts_matrix'
}
else {
stop('Unrecognized cds_object value \'', files[['cds_object']][[i]], '\'')
}
file_format <- files[['file_format']][[i]]
if(file_format == 'rds') {
file_type <- 'RDS'
}
else
if(file_format == 'hdf5') {
file_type <- 'RDS_HDF5'
}
else
if(file_format == 'annoy_index' || file_format == 'hnsw_index') {
file_type <- 'NN_index'
}
else
if(file_format == 'umap_annoy_index') {
file_type <- 'UMAP_NN_index'
}
else
if(file_format == 'BPCells:MatrixDir') {
file_type <- 'BPCells:MatrixDir'
}
else {
stop('Unrecognized file_format value \'', file_format, '\'')
}
file_name <- basename(files[['file_path']][[i]])
message(' ', file_name, ' (', reduction_method, ': ', file_type, ' from ', process, ')', appendLF=appendLF)
}
}
check_monocle_object_files <- function( directory_path, file_index, read_test=FALSE, verbose=FALSE ) {
if(read_test == TRUE) {
message('Info: check_monocle_object_files: read_test is not implemented yet.')
}
message('Info: checking for monocle object files...')
error_list <- list()
for(ifile in seq_along(file_index[['files']][['cds_object']])) {
file_path <- file.path(directory_path, file_index[['files']][['file_path']][[ifile]])
file_format <- file_index[['files']][['file_format']][[ifile]]
cds_object <- file_index[['files']][['cds_object']][[ifile]]
reduction_method <- file_index[['files']][['reduction_method']][[ifile]]
md5sum <- file_index[['files']][['file_md5sum']][[ifile]]
message(' ', file_path, '...', appendLF=FALSE)
if(cds_object == 'cds') {
if(file_format == 'rds') {
if(file.exists(file_path)) {
message('OK')
}
else{
message(' *** file missing ***')
error_list[[length(error_list)+1]] <- paste0('missing file: ', file_path)
}
}
else
if(file_format == 'hdf5') {
if(file.exists(file_path)) {
message('OK')
}
else {
message(' *** file missing ***')
error_list[[length(error_list)+1]] <- paste0('missing file: ', file_path)
}
}
else {
stop('Unrecognized cds format value \'', file_format, '\'')
}
}
else
if(cds_object == 'reduce_dim_aux') {
if(file_format == 'rds') {
if(file.exists(file_path)) {
message('OK')
}
else {
message(' *** file missing ***')
error_list[[length(error_list)+1]] <- paste0('missing file: ', file_path)
}
}
else
if(file_format == 'annoy_index') {
if(file.exists(file_path)) {
message('OK')
}
else {
message(' *** file missing ***')
error_list[[length(error_list)+1]] <- paste0('missing file: ', file_path)
}
}
else
if(file_format == 'hnsw_index') {
if(file.exists(file_path)) {
message('OK')
}
else
{
message(' *** file missing ***')
error_list[[length(error_list)+1]] <- paste0('missing file: ', file_path)
}
}
else
if(reduction_method == 'UMAP' && file_format == 'umap_annoy_index') {
if(file.exists(file_path)) {
message('OK')
}
else {
message(' *** file missing ***')
error_list[[length(error_list)+1]] <- paste0('missing file: ', file_path)
}
}
else {
stop('Unrecognized file format value \'', file_format, '\'')
}
}
else
if(cds_object == 'bpcells_matrix_dir') {
if(dir.exists(file_path)) {
message('OK')
}
else {
message(' *** directory missing ***')
error_list[[length(error_list)+1]] <- paste0('missing directory: ', file_path)
}
}
else {
stop('Unrecognized cds_object value \'', cds_object, '\'')
}
}
if(length(error_list) > 0) {
msg <- 'check_monocle_object_files: '
msg <- paste0(msg, '\n ', error_list, collapse='\n')
stop(msg)
}
else {
message('Info: all expected monocle object files exist.')
}
return(0)
}
# Make a tar file of an output directory.
make_tar_of_dir <- function(directory_path, archive_control) {
message('Info: making a tar file of the output directory...')
# Make a tar file of output directory, if requested.
if(archive_control[['archive_compression']] == 'gzip') {
archive_name <- paste0(directory_path, '.tar.gz')
}
else
if(archive_control[['archive_compression']] == 'bzip2') {
archive_name <- paste0(directory_path, '.tar.bz2')
}
else
if(archive_control[['archive_compression']] == 'xz') {
archive_name <- paste0(directory_path, '.tar.xz')
}
else {
archive_name <- paste0(directory_path, '.tar')
}
stat <- tryCatch({
tar(tarfile=archive_name,
files=directory_path,
compression=archive_control[['archive_compression']])
},
error=function(c) { stop(paste0(trimws(c),
'\n error writing file ', archive_name,
'\n', dbar40,
'\n', report_path_status(directory_path, dirname(directory_path)),
'\n', dbar40,
'\n* error in make_tar_of_dir'))
},
finally={
message(paste0(' made tar archive file \"', archive_name, '\"'))
}
) # tryCatch
if(stat != 0) {
stop(paste0('\n tar stopped with a non-zero status.',
'\n This may be an error: please check that the tar file can be',
'\n read using the \'tar\' command with the \'t\' option.'))
}
message(' Done.')
}
set_archive_control <- function(archive_control=list()) {
archive_control_default <- get_global_variable('archive_control')
if(is.null(archive_control_default[['archive_type']])) {
archive_control_default[['archive_type']] <- 'tar'
}
if(is.null(archive_control_default[['archive_compression']])) {
archive_control_default[['archive_compression']] <- 'none'
}
if(is.null(archive_control[['archive_type']])) {
archive_control[['archive_type']] <- archive_control_default[['archive_type']]
}
if(is.null(archive_control[['archive_compression']])) {
archive_control[['archive_compression']] <- archive_control_default[['archive_compression']]
}
return(archive_control)
}
#
#' Save cell_data_set transform models.
#'
#' Save the transform models in the cell_data_set to the
#' specified directory by writing the R objects to RDS
#' files and the nearest neighbor indices to
#' index files. save_transform_models saves transform
#' models made by running the preprocess_cds and
#' reduce_dimension functions on an initial cell_data_set.
#' Subsequent cell_data_sets are transformed into the
#' reduced dimension space of the initial cell_data_set by
#' loading the new data into a new cell_data_set, loading
#' the initial data set transform models into the new
#' cell_data_set using the load_transform_models function,
#' and applying those transform models to the new data set
#' using the preprocess_transform() and
#' reduce_dimension_transform() functions. In this case, do
#' not run the preprocess_cds() or reduce_dimension()
#' functions on the new cell_data_set. Additionally,
#' save_transform_models() saves nearest neighbor indices
#' when the preprocess_cds() and reduce_dimension()
#' functions are run with the make_nn_index=TRUE parameter.
#' These indices are used to find matches between cells in
#' the new processed cell_data_set and the initial
#' cell_data_set using index search functions. For more
#' information see the help for transfer_cell_labels().
#' save_transform_models() saves the models to a directory
#' given by the directory_path parameter.
#'
#' @param cds a cell_data_set with existing models.
#' @param directory_path a string giving the name of the directory
#' in which to write the model files.
#' @param comment a string with optional notes that is saved with
#' the objects.
#' @param verbose a boolean determining whether to print information
#' about the saved files.
#' @param archive_control a list that is used to control archiving
#' the output directory. The archive_control parameters are
#' \describe{
#' \item{archive_type}{a string giving the method used to
#' archive the directory. The acceptable values are
#' "tar" and "none". The directory is not archived when
#' archive_type is "none". The default is "tar".}
#' \item{archive_compression}{a string giving the type of
#' compression applied to the archive file. The acceptable
#' values are "none", "gzip", "bzip2", and "xz". The
#' default is "none".}
#' }
#'
#' @section Notes:
#' \itemize{
#' \item{The R tar archive function used by Monocle3 may have
#' a limited output file size of 8 GB. If you encounter
#' this problem, you can set the environment variable
#' "tar" to a tar executable that has no size limit,
#' for example, gnu tar. You can do this in the
#' $HOME/.monoclerc file by adding a line consisting of
#' Sys.setenv('tar' = paste(Sys.getenv("TAR"), "-H", "gnu")).
#' See the R 'tar' documentation for more information.}
#' }
#' @return none.
#'
#' @examples
#' \dontrun{
#' cds <- load_a549()
#' cds <- preprocess_cds(cds)
#' cds <- reduce_dimension(cds)
#' save_transform_models(cds, 'tm')
#' }
#'
#' @export
# Bioconductor forbids writing to user directories so examples
# is not run.
save_transform_models <- function( cds, directory_path, comment="", verbose=TRUE, archive_control=list()) {
assertthat::assert_that(is.list(archive_control),
msg = 'save_transform_models: invalid archive_control parameter')
archive_control <- set_archive_control(archive_control)
assertthat::assert_that(archive_control[['archive_type']] %in% c('tar', 'none'),
msg=paste0("archive_type must be either \'none\' or \'tar\'"))
assertthat::assert_that(archive_control[['archive_compression']] %in% c('gzip', 'bzip2', 'xz', 'none'),
msg=paste0("archive_compression must be \'none\', \'gzip\', \'bzip2\', or \'xz\'."))
# Make a 'normalized' path string. The annoy index save function does not
# recognize tildes.
directory_path <- normalizePath(directory_path, mustWork=FALSE)
# file information is written to an RDS file
# in directory_path
# cds_object: reduce_dim_aux
# reduction_method: PCA | LSI | Aligned | UMAP ...
# nn_method: annoy | hnsw
# object_spec: ex: cds@reduce_dim_aux[[reduction_method]]
# file_format: rds | annoy_index | umap_annoy_index | hnsw_index
# file_path: path within directory_path (need file name)
# file_md5sum: md5sum of file(s)
file_index <- list( 'save_function' = 'save_transform_models',
'archive_date' = Sys.time(),
'r_version' = R.Version()$version.string,
'uwot_version' = utils::packageVersion('uwot'),
'hnsw_version' = utils::packageVersion('RcppHNSW'),
'monocle_version' = utils::packageVersion('monocle3'),
'cds_version' = S4Vectors::metadata(cds)$cds_version,
'archive_version' = get_global_variable('transform_models_version'),
'directory' = directory_path,
'comment' = comment,
'files' = data.frame(cds_object = character(0),
reduction_method = character(0),
object_spec = character(0),
file_format = character(0),
file_path = character(0),
file_md5sum = character(0),
stringsAsFactors = FALSE))
# Gather reduce_dimension reduction_methods and whether each has an annoy index.
methods_reduce_dim <- list()
for( reduction_method in names(cds@reduce_dim_aux)) {
methods_reduce_dim[[reduction_method]] <- list()
methods_reduce_dim[[reduction_method]][['rds_path']] <- paste0('rdd_', tolower(reduction_method), '_transform_model.rds')
methods_reduce_dim[[reduction_method]][['annoy_index_path']] <- paste0('rdd_', tolower(reduction_method), '_transform_model_annoy.idx')
methods_reduce_dim[[reduction_method]][['hnsw_index_path']] <- paste0('rdd_', tolower(reduction_method), '_transform_model_hnsw.idx')
if(reduction_method == 'UMAP') {
if(has_nn_index(cds, 'umap_model_annoy')) {
methods_reduce_dim[[reduction_method]][['has_model_index']] <- TRUE
methods_reduce_dim[[reduction_method]][['umap_index_path']] <- paste0('rdd_', tolower(reduction_method), '_transform_model_umap.idx')
}
else
methods_reduce_dim[[reduction_method]][['has_model_index']] <- FALSE
}
if(has_nn_index(cds, paste0(tolower(reduction_method), '_search_annoy')))
methods_reduce_dim[[reduction_method]][['has_annoy_index']] <- TRUE
else
methods_reduce_dim[[reduction_method]][['has_annoy_index']] <- FALSE
if(has_nn_index(cds, paste0(tolower(reduction_method), '_search_hnsw')))
methods_reduce_dim[[reduction_method]][['has_hnsw_index']] <- TRUE
else
methods_reduce_dim[[reduction_method]][['has_hnsw_index']] <- FALSE
}
# Make directory if necessary.
dir.create(path = directory_path, showWarnings=FALSE, recursive=TRUE, mode='0777')
# Remove files, if they exist.
for(reduction_method in names(methods_reduce_dim)) {
if(file.exists(file.path(directory_path, methods_reduce_dim[[reduction_method]][['rds_path']])))
file.remove(file.path(directory_path, methods_reduce_dim[[reduction_method]][['rds_path']]))
if(file.exists(file.path(directory_path, methods_reduce_dim[[reduction_method]][['annoy_index_path']])))
file.remove(file.path(directory_path, methods_reduce_dim[[reduction_method]][['annoy_index_path']]))
if(file.exists(file.path(directory_path, methods_reduce_dim[[reduction_method]][['hnsw_index_path']])))
file.remove(file.path(directory_path, methods_reduce_dim[[reduction_method]][['hnsw_index_path']]))
if(reduction_method == 'UMAP') {
if(methods_reduce_dim[[reduction_method]][['has_model_index']]) {
if(file.exists(file.path(directory_path, methods_reduce_dim[[reduction_method]][['umap_index_path']])))
file.remove(file.path(directory_path, methods_reduce_dim[[reduction_method]][['umap_index_path']]))
}
}
}
# Save reduce_dimension annoy indices.
# Notes:
# o save RDS files before the corresponding index files in
# order to enable loading.
#
for(reduction_method in names(methods_reduce_dim)) {
file_path <- file.path(directory_path, methods_reduce_dim[[reduction_method]][['rds_path']])
tryCatch(
{
base::saveRDS(cds@reduce_dim_aux[[reduction_method]], file=file_path)
},
error = function(c) { stop(paste0(trimws(c),
'\n error writing file ', file_path,
'\n', dbar40,
'\n', report_path_status(dirname(file_path)),
'\n', dbar40,
'\n* error in save_transform_models')) })
md5sum <- tools::md5sum(file_path)
if(is.na(md5sum)) {
stop(paste0('\n no checksum for file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path, dirname(file_path)),
'\n', dbar40,
'\n* error in save_transform_models'))
}
file_index[['files']] <- rbind(file_index[['files']],
data.frame(cds_object = 'reduce_dim_aux',
reduction_method = reduction_method,
object_spec = object_name_to_string(cds@reduce_dim_aux[[reduction_method]]),
file_format = 'rds',
file_path = methods_reduce_dim[[reduction_method]][['rds_path']],
file_md5sum = md5sum,
stringsAsFactors = FALSE))
if(methods_reduce_dim[[reduction_method]][['has_annoy_index']]) {
file_path <- file.path(directory_path, methods_reduce_dim[[reduction_method]][['annoy_index_path']])
tryCatch(
save_annoy_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']], file_path),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_transform_models')) })
md5sum <- tools::md5sum(file_path)
if(is.na(md5sum)) {
stop(paste0('\n no checksum for file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path, dirname(file_path)),
'\n', dbar40,
'\n* error in save_transform_models'))
}
file_index[['files']] <- rbind(file_index[['files']],
data.frame(cds_object = 'reduce_dim_aux',
reduction_method = reduction_method,
object_spec = object_name_to_string(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']]),
file_format = 'annoy_index',
file_path = methods_reduce_dim[[reduction_method]][['annoy_index_path']],
file_md5sum = md5sum,
stringsAsFactors = FALSE))
}
if(methods_reduce_dim[[reduction_method]][['has_hnsw_index']]) {
file_path <- file.path(directory_path, methods_reduce_dim[[reduction_method]][['hnsw_index_path']])
tryCatch(
save_hnsw_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']], file_path),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_transform_model')) })
md5sum <- tools::md5sum(file_path)
if(is.na(md5sum)) {
stop(paste0('\n no checksum for file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path, dirname(file_path)),
'\n', dbar40,
'\n* error in save_transform_models'))
}
file_index[['files']] <- rbind(file_index[['files']],
data.frame(cds_object = 'reduce_dim_aux',
reduction_method = reduction_method,
object_spec = object_name_to_string(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']]),
file_format = 'hnsw_index',
file_path = methods_reduce_dim[[reduction_method]][['hnsw_index_path']],
file_md5sum = md5sum,
stringsAsFactors = FALSE))
}
if(reduction_method == 'UMAP' && methods_reduce_dim[[reduction_method]][['has_model_index']]) {
file_path <- file.path(directory_path, methods_reduce_dim[[reduction_method]][['umap_index_path']])
md5sum <- tryCatch(
save_umap_nn_indexes(cds@reduce_dim_aux[[reduction_method]][['model']][['umap_model']], file_path),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_transform_models')) })
file_index[['files']] <- rbind(file_index[['files']],
data.frame(cds_object = 'reduce_dim_aux',
reduction_method = reduction_method,
object_spec = object_name_to_string(cds@reduce_dim_aux[[reduction_method]][['model']][['umap_model']]),
file_format = 'umap_annoy_index',
file_path = methods_reduce_dim[[reduction_method]][['umap_index_path']],
file_md5sum = md5sum,
stringsAsFactors = FALSE))
}
}
# Save file_index.rds.
tryCatch(base::saveRDS(file_index, file=file.path(directory_path, 'file_index.rds')),
error = function(c) { stop(paste0(trimws(c),
'\n error writing file ', file.path(directory_path, 'file_index.rds'),
'\n', dbar40,
'\n', report_path_status(dirname(file.path(directory_path, 'file_index.rds'))),
'\n', dbar40,
'\n* error in save_transform_models')) })
if(verbose) {
tryCatch(report_files_saved(file_index),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_transform_models')) })
}
#
# Check for saved files.
#
tryCatch(check_monocle_object_files( directory_path, file_index, read_test=FALSE, verbose=verbose ),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_transform_models')) })
# Make a tar file of output directory, if requested.
if(archive_control[['archive_type']] == 'tar') {
tryCatch(make_tar_of_dir(directory_path=directory_path, archive_control=archive_control),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_transform_models')) })
}
}
#
# Copy reduce_dim_aux slot objects from cds_src to cds_dst. The copy
# runs on the R objects only. It does not try to copy objects about
# which R knows nothing, such as annoy and hnsw indices. At this
# time, this function is used only by load_transform_models().
#
copy_reduce_dim_aux <- function(cds_dst, cds_src) {
for( reduction_method in names(cds_src@reduce_dim_aux@listData)) {
cds_dst@reduce_dim_aux[[reduction_method]] <- cds_src@reduce_dim_aux[[reduction_method]]
}
return(cds_dst)
}
#' Load transform models into a cell_data_set.
#'
#' Load transform models into a cell_data_set where the transform
#' models directory was made using either save_transform_models()
#' or save_monocle_objects(). This function over-writes existing
#' models in the cell_data_set. For more information see the
#' help information for save_transform_models() and
#' save_monocle_objects().
#'
#' @param cds a cell_data_set to be transformed using the models.
#' @param directory_path a string giving the name of the directory
#' from which to read the model files. The model file directory
#' is made by either save_transform_models() or
#' save_monocle_objects().
#'
#' @return a cell_data_set with the transform models loaded by
#' load_transform_models().
#'
#' @examples
#' \dontrun{
#' cds <- load_a549()
#' cds <- preprocess_cds(cds)
#' cds <- reduce_dimension(cds)
#' save_transform_models(cds, 'tm')
#' cds1 <- load_a549()
#' cds1 <- load_transform_models(cds1, 'tm')
#' }
#' @export
# Bioconductor forbids writing to user directories so examples
# is not run.
load_transform_models <- function(cds, directory_path) {
appendLF <- TRUE
# Make a 'normalized' path string. The annoy index save function does not
# recognize tildes.
directory_path <- normalizePath(directory_path, mustWork=FALSE)
# Check for directory.
if(!file.exists(directory_path))
stop('Directory \'', directory_path, '\' does not exist.')
# Check for file_index.rds.
file_index_path <- file.path(directory_path, 'file_index.rds')
if(!file.exists(file_index_path))
stop('Missing file index file \'', file_index_path, '\'')
# Read file index.
file_index <- tryCatch(readRDS(file_index_path),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_index_path,
'\n', dbar40,
'\n', report_path_status(file_index_path, dirname(file_index_path)),
'\n', dbar40,
'\n* error in load_transform_models')) })
# Check that this is a save_transform_models archive.
save_function <- file_index[['save_function']]
if(save_function != 'save_transform_models' && save_function != 'save_monocle_objects') {
stop('The files in ', directory_path, ' are not from either save_transform_models or save_monocle_objects.')
}
# Write stored comment field.
if(length(file_index[['comment']]) > 1) {
message('File comment: ', file_index[['comment']], appendLF=appendLF)
}
# Loop through the files in file_index.rds in order
# to restore objects.
for(ifile in seq_along(file_index[['files']][['cds_object']])) {
file_path <- file.path(directory_path, file_index[['files']][['file_path']][[ifile]])
file_format <- file_index[['files']][['file_format']][[ifile]]
cds_object <- file_index[['files']][['cds_object']][[ifile]]
reduction_method <- file_index[['files']][['reduction_method']][[ifile]]
md5sum <- file_index[['files']][['file_md5sum']][[ifile]]
# Don't try to load BPCells matrix.
if(save_function == 'save_monocle_objects' && cds_object == 'bpcells_matrix_dir') {
next
}
if(!file.exists(file_path))
stop('load_transform_models: missing file \'', file_path, '\'')
#
# For UWOT UMAP annoy index, the function load_umap_nn_indexes
# checks md5sums internally so don't check here.
#
check_md5 <- TRUE
if(cds_object == 'reduce_dim_aux' &&
reduction_method == 'UMAP' &&
file_format == 'umap_annoy_index') {
check_md5 <- FALSE
}
# Allow user to over-ride test.
if(is.na(md5sum)) {
check_md5 <- FALSE
}
if(check_md5) {
md5sum_file <- tools::md5sum(file_path)
if(is.na(md5sum_file)) {
stop(paste0('\n no checksum for file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path, dirname(file_path)),
'\n', dbar40,
'\n* error in load_transform_models'))
}
if(md5sum_file != md5sum) {
stop(report_checksum_difference('load_transform_models', file_path, md5sum, md5sum_file))
}
}
#
# Note:
# o expect that the RDS file for a reduction_method
# (or cds) appears before index files for the
# reduction_method
#
if(cds_object == 'cds') {
if(file_format == 'rds') {
cds_tmp <- tryCatch(readRDS(file_path),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path, dirname(file_path)),
'\n', dbar40,
'\n* error in load_transform_models')) })
cds <- copy_reduce_dim_aux(cds, cds_tmp)
rm(cds_tmp)
}
else
if(file_format == 'hdf5') {
stop('loading transform models from an HDF5 file is not supported')
}
}
else
if(cds_object == 'reduce_dim_aux') {
if(file_format == 'rds') {
cds@reduce_dim_aux[[reduction_method]] <- tryCatch(readRDS(file_path),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path, dirname(file_path)),
'\n', dbar40,
'\n* error in load_transform_models')) })
}
else
if(file_format == 'annoy_index') {
cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']] <- update_annoy_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']])
metric <- cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']][['metric']]
ncolumn <- cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']][['ncol']]
cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']] <- tryCatch(
{
load_annoy_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']], file_path, metric, ncolumn)
},
error = function(c) { stop(paste0(trimws(c), '\n* error in load_transform_models')) })
}
else
if(file_format == 'hnsw_index') {
cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']] <- update_hnsw_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']])
metric <- cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']][['metric']]
ncolumn <- cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']][['ncol']]
cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']] <- tryCatch(
{
load_hnsw_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']], file_path, metric, ncolumn)
},
error = function(c) { stop(paste0(trimws(c), '\n* error in load_transform_models')) })
}
else
if(reduction_method == 'UMAP' && file_format == 'umap_annoy_index') {
cds@reduce_dim_aux[[reduction_method]][['model']][['umap_model']] <- tryCatch(
{
load_umap_nn_indexes(cds@reduce_dim_aux[[reduction_method]][['model']][['umap_model']], file_path, md5sum)
},
error = function(c) { stop(paste0(trimws(c), '\n* error in load_transform_models')) })
}
else {
stop('Unrecognized file format value \'', file_format, '\'')
}
cds <- set_model_identity_path(cds, reduction_method, directory_path)
}
else {
stop('Unrecognized cds_object value \'', cds_object, '\'')
}
}
return(cds)
}
#
# Check cds assays for HDF5Array objects.
#
#' @importFrom S4Vectors getListElement
test_hdf5_assays <- function(cds) {
assays <- assays(cds)
for( idx in seq_along(assays)) {
asyl <- getListElement(assays, idx)
hdf5_test <- unlist(DelayedArray::seedApply(asyl, methods::is, "HDF5ArraySeed"))
if(any(unlist(hdf5_test))) return(TRUE)
}
FALSE
}
#
# Get md5 checksum value of a file in the BPCells matrix directory.
#
bpcells_matdir_md5 <- function(matrix_dir_path) {
matrix_dir_path <- normalizePath(matrix_dir_path, mustWork=FALSE)
file_name_list <- c('val', 'val_data')
for(file_name in file_name_list) {
file_path <- file.path(matrix_dir_path, file_name)
file_exists <- file.exists(file_path)
if(file_exists) {
md5sum <- tools::md5sum(file_path)
# md5sum returns NA for unreadable file.
return(md5sum)
}
}
stop(paste0('\n no BPCells checksum',
'\n', dbar40,
'\n', report_path_status(matrix_dir_path, dirname(matrix_dir_path)),
'\n', dbar40,
'\n* error in bpcells_matdir_md5'))
}
#
#' Save a Monocle3 full cell_data_set.
#'
#' Save a Monocle3 full cell_data_set to a specified directory
#' by writing the R objects to an RDS file, the nearest neighbor
#' indices to index files, and a BPCells matrix directory when
#' the counts matrix is stored in that format. This includes
#' the Annoy nearest neighbor index that UMAP creates and is
#' required for use with the reduce_dimension_transform()
#' function.
#'
#' @param cds a cell_data_set to save.
#' @param directory_path a string giving the name of the directory
#' in which to write the object files.
#' @param hdf5_assays a boolean determining whether the
#' non-HDF5Array assay objects are saved as HDF5 files. At this
#' time cell_data_set HDF5Array assay objects are stored as
#' HDF5Assay files regardless of the hdf5_assays parameter value.
#' @param comment a string with optional notes that is saved with
#' the objects.
#' @param verbose a boolean determining whether to print information
#' about the saved files.
#' @param archive_control a list that is used to control archiving
#' the output directory. The archive_control parameters are
#' \describe{
#' \item{archive_type}{a string giving the method used to
#' archive the directory. The acceptable values are
#' "tar" and "none". The directory is not archived when
#' archive_type is "none". The default is "tar".}
#' \item{archive_compression}{a string giving the type of
#' compression applied to the archive file. The acceptable
#' values are "none", "gzip", "bzip2", and "xz". The
#' default is "none".}
#' }
#' @section Notes:
#' \itemize{
#' \item{You must use save_monocle_objects() to save your
#' cell_data_set if you use BPCells to store the
#' counts matrix. Warning: if you use saveRDS() to
#' save a cell_data_set with a BPCells counts matrix
#' you will lose the counts matrix.}
#' \item{You must use save_monocle_objects() to save your
#' cell_data_set if you will use the output
#' directory for projection and label transfer. Warning:
#' if you use saveRDS() to save the cell_data_set,
#' you will lose the essential nearest neighbor indices.
#' Note that you can use the save_transform_models()
#' function to save the transform models and indices
#' without saving the full cell_data_set but you must
#' do this when the indices exist in the cell_data_set.}
#' \item{See the help information for save_transform_models()
#' for additional information about transform models.}
#' \item{Do not modify the files in the save_monocle_objects()
#' output directory. save_monocle_objects() calculates
#' and saves a checksum value for each file written and
#' load_monocle_objects() uses the checksums to make sure
#' that the files haven't changed. (Monocle3 does not
#' calculate a checksum for a BPCells matrix directory
#' and its contents.)}
#' \item{The assays objects are saved as HDF5Array files when
#' hdf5_assays=TRUE or when the cell_data_set assays are
#' HDF5Array objects. If any assay in the cell_data set is
#' an HDF5 object, all assays must be. When
#' save_monocle_objects() is run with hdf5_assays=TRUE,
#' the load_monocle_objects() function loads the saved
#' assays into HDF5Array objects in the resulting
#' cell_data_set. Note that functions such as
#' preprocess_cds() that are run on assays stored as
#' HDF5Arrays are much, much slower than the same
#' functions run on assays stored as in-memory or
#' BPCells matrices. You may want to investigate
#' parameters related to the Bioconductor DelayedArray
#' and BiocParallel packages in this case.}
#' \item{You cannot use hdf5_assays=TRUE when a cell_data_set
#' has a BPCells counts matrix.}
#' \item{It's not clear that there is a reason to use
#' hdf5_assays=TRUE.}
#' \item{save_monocle_objects() stops when an internal file
#' write function returns an error. This includes functions
#' that save a BPCells directory and functions that save
#' nearest neighbor indices. If this happens, we urge you to
#' fix the problem and then re-run save_monocle_objects()
#' without exiting R, if possible. These errors can happen
#' if you have too little free disk space or you don't have
#' permission to write to the output directory location.}
#' \item{The counts matrix is stored as a BPCells matrix when the
#' user gives the parameter
#' matrix_control=list(matrix_class="BPCells") in Monocle3
#' functions such as load_mm_data() and load_mtx_data().
#' Also, a BPCells counts matrix can be stored directly in
#' the assays slot of a cell_data_set using BPCells
#' functions such as import_matrix_market() and
#' write_matrix_dir(). (In this case, the Monocle3
#' new_cell_data_set() function stores a row-major copy of
#' the counts matrix too, which is used in certain Monocle3
#' functions.) save_monocle_objects() saves this BPCells
#' count matrix.}
#' \item{The UMAP functions makes an Annoy nearest neighbor
#' index internally, which is used for a UMAP
#' projection by the Monocle3 function
#' reduce_dimension_transform(). save_monocle_objects()
#' saves this Annoy index.}
#' \item{The Monocle3 preprocess_cds() and reduce_dimension()
#' functions make Annoy nearest neighbor indices when
#' run with the parameter build_nn_index=TRUE. These
#' indices can be used for label transfer with the
#' Monocle3 transfer_cell_labels() function.
#' save_monocle_objects() saves these Annoy indices.}
#' \item{The save_monocle_objects() output directory is not
#' removed after it is archived by save_monocle_objects().}
#' \item{The R tar archive function used by Monocle3 may have
#' a limited output file size of 8 GB. If you encounter
#' this problem, you can set the environment variable
#' "tar" to a tar executable that has no size limit,
#' for example, gnu tar. You can do this in the
#' $HOME/.monoclerc file by adding a line consisting of
#' Sys.setenv('tar' = paste(Sys.getenv("TAR"), "-H", "gnu")).
#' See the R 'tar' documentation for more information.}
#' \item{You can change the default archive_control list values
#' by defining the default in your $HOME/.monoclerc file.
#' For example, you can include the command
#' monocle3:::set_global_variable("archive_control", list(archive_type="none", archive_compression="none"))
#' to avoid making a tar file of the monocle objects
#' directory.}
#' }
#'
#' @return none.
#'
#' @examples
#' \dontrun{
#' cds <- load_a549()
#' save_monocle_objects(cds, 'mo')
#' }
#'
#' @export
#
# Bioconductor forbids writing to user directories so examples
# is not run.
#
# *** Warning: watch for changes to save_monocle_objects() that may ***
# *** break load_transform_models() because load_transform_models() ***
# *** can read a save_monocle_objects() output directory. ***
#
save_monocle_objects <- function(cds, directory_path, hdf5_assays=FALSE, comment="", verbose=TRUE, archive_control=list()) {
assertthat::assert_that(is.list(archive_control),
msg = 'save_transform_models: invalid archive_control parameter')
archive_control <- set_archive_control(archive_control)
assertthat::assert_that(archive_control[['archive_type']] %in% c('tar', 'none'),
msg=paste0("archive_type must be either \'none\' or \'tar\'"))
assertthat::assert_that(archive_control[['archive_compression']] %in% c('gzip', 'bzip2', 'xz', 'none'),
msg=paste0("archive_compression must be \'none\', \'gzip\', \'bzip2\', or \'xz\'."))
# Make a 'normalized' path string. The annoy index save function does not
# recognize tildes.
directory_path <- normalizePath(directory_path, mustWork=FALSE)
# file information is written to an RDS file
# in directory_path
# cds_object: cds | reduce_dim_aux
# reduction_method: PCA | LSI | Aligned | tSNE | UMAP ...
# nn_method: annoy | hnsw
# object_spec: ex: cds@reduce_dim_aux[[reduction_method]]
# file_format: rds | annoy_index | umap_annoy_index | hnsw_index
# file_path: path within directory_path (need file name)
# file_md5sum: md5sum of file(s)
file_index <- list( 'save_function' = 'save_monocle_objects',
'archive_date' = Sys.time(),
'r_version' = R.Version()$version.string,
'uwot_version' = utils::packageVersion('uwot'),
'hnsw_version' = utils::packageVersion('RcppHNSW'),
'hdf5array_version' = utils::packageVersion('HDF5Array'),
'bpcells_version' = utils::packageVersion('BPCells'),
'monocle_version' = utils::packageVersion('monocle3'),
'cds_version' = S4Vectors::metadata(cds)$cds_version,
'archive_version' = get_global_variable('monocle_objects_version'),
'directory' = directory_path,
'comment' = comment,
'files' = data.frame(cds_object = character(0),
reduction_method = character(0),
object_spec = character(0),
file_format = character(0),
file_path = character(0),
file_md5sum = character(0),
stringsAsFactors = FALSE))
# Save assays as HDF5Array objects?
hdf5_assay_flag <- hdf5_assays || test_hdf5_assays(cds)
# Save assays as BPCells Matrix_dir or BPCells 10xHDF5 file?
bpcells_matrix_dir_flag <- FALSE
matrix_info <- get_matrix_info(mat=counts(cds))
if(matrix_info[['matrix_class']] == 'BPCells' &&
matrix_info[['matrix_mode']] == 'dir') {
#
# Check that BPCells matrix directory exists.
# We use the bpcells_matrix_dir_flag to save the
# directory (later) and so we don't try to save
# the BPCells matrix files if the directory
# doesn't exist.
#
if(dir.exists(matrix_info[['matrix_path']])) {
bpcells_matrix_dir_flag <- TRUE
}
else {
message('Warning: save_monocle_objects: the CDS has a BPCells count matrix but\nthe BPCells count matrix directory is missing, which will likely\ncause problems for anyone using this file in the future.\nI\'m continuing without it.')
}
}
# hdf5_assays=TRUE is incompatible with BPCells count matrices.
if(bpcells_matrix_dir_flag == TRUE && hdf5_assays == TRUE) {
stop('save_monocle_objects: hdf5 must be FALSE when the cell_data_set\ncounts matrix is stored using BPCells.')
}
# Path of cds object file.
rds_path <- 'cds_object.rds'
hdf5_path <- 'hdf5_object'
bpcells_matrix_dir <- 'bpcells_matrix_dir'
# Gather reduce_dimension reduction_method names for which indices exist.
methods_reduce_dim <- list()
for(reduction_method in names(cds@reduce_dim_aux)) {
methods_reduce_dim[[reduction_method]] <- list()
# These are the nn search indices make when build_nn_index=TRUE.
# These names are confusing...
methods_reduce_dim[[reduction_method]][['annoy_index_path']] <- paste0('rdd_', tolower(reduction_method), '_transform_model_annoy.idx')
methods_reduce_dim[[reduction_method]][['hnsw_index_path']] <- paste0('rdd_', tolower(reduction_method), '_transform_model_hnsw.idx')
# This is the annoy index used internally by UMAP.
# These names are confusing...
if(reduction_method == 'UMAP') {
if(has_nn_index(cds, 'umap_model_annoy')) {
methods_reduce_dim[[reduction_method]][['has_model_index']] <- TRUE
methods_reduce_dim[[reduction_method]][['umap_index_path']] <- paste0('rdd_', tolower(reduction_method), '_transform_model_umap.idx')
}
else
methods_reduce_dim[[reduction_method]][['has_model_index']] <- FALSE
}
if(has_nn_index(cds, paste0(tolower(reduction_method), '_search_annoy')))
methods_reduce_dim[[reduction_method]][['has_annoy_index']] <- TRUE
else
methods_reduce_dim[[reduction_method]][['has_annoy_index']] <- FALSE
if(has_nn_index(cds, paste0(tolower(reduction_method), '_search_hnsw')))
methods_reduce_dim[[reduction_method]][['has_hnsw_index']] <- TRUE
else
methods_reduce_dim[[reduction_method]][['has_hnsw_index']] <- FALSE
}
# Make directory if necessary.
dir.create(path = directory_path, showWarnings=FALSE, recursive=TRUE, mode='0777')
# Remove files, if they exist.
# BPCells MatrixDir directory.
if(file.exists(file.path(directory_path, bpcells_matrix_dir)))
unlink(file.path(directory_path, bpcells_matrix_dir), recursive=TRUE)
# Reduction method related files.
for(reduction_method in names(methods_reduce_dim)) {
if(file.exists(file.path(directory_path, rds_path)))
file.remove(file.path(directory_path, rds_path))
if(file.exists(file.path(directory_path, methods_reduce_dim[[reduction_method]][['annoy_index_path']])))
file.remove(file.path(directory_path, methods_reduce_dim[[reduction_method]][['annoy_index_path']]))
if(file.exists(file.path(directory_path, methods_reduce_dim[[reduction_method]][['hnsw_index_path']])))
file.remove(file.path(directory_path, methods_reduce_dim[[reduction_method]][['hnsw_index_path']]))
if(reduction_method == 'UMAP') {
if(methods_reduce_dim[[reduction_method]][['has_model_index']]) {
if(file.exists(file.path(directory_path, methods_reduce_dim[[reduction_method]][['umap_index_path']])))
file.remove(file.path(directory_path, methods_reduce_dim[[reduction_method]][['umap_index_path']]))
}
}
}
#
# Save cds object.
# Notes:
# o allow for HDF5Array assay objects.
#
if(!hdf5_assay_flag) {
file_path <- file.path(directory_path, rds_path)
tryCatch(
base::saveRDS(cds, file_path),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_path,
'\n', dbar40,
'\n', report_path_status(dirname(file_path)),
'\n', dbar40,
'\n* save_monocle_objects')) })
md5sum <- tools::md5sum(file_path)
if(is.na(md5sum)) {
stop(paste0('\n no checksum for file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path, dirname(file_path)),
'\n', dbar40,
'\n* save_monocle_objects'))
}
file_index[['files']] <- rbind(file_index[['files']],
data.frame(cds_object = 'cds',
reduction_method = NA,
object_spec = object_name_to_string(cds),
file_format = 'rds',
file_path = rds_path,
file_md5sum = md5sum,
stringsAsFactors = FALSE))
# Save BPCells MatrixDir, if required.
if(bpcells_matrix_dir_flag) {
bpcells_matrix_path <- file.path(directory_path, bpcells_matrix_dir)
mat <- counts(cds)
tryCatch(
BPCells::write_matrix_dir(mat=mat, dir=bpcells_matrix_path, compress=FALSE, buffer_size=8192L, overwrite=FALSE),
error = function(c) { stop(paste0(trimws(c),
'\n error writing directory ', bpcells_matrix_path,
'\n', dbar40,
'\n', report_path_status(dirname(bpcells_matrix_path)),
'\n', dbar40,
'\n* error in save_monocle_objects')) })
val_md5sum <- tryCatch(bpcells_matdir_md5(bpcells_matrix_path),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_monocle_objects')) })
file_index[['files']] <- rbind(file_index[['files']],
data.frame(cds_object = 'bpcells_matrix_dir',
reduction_method = NA,
object_spec = object_name_to_string(mat),
file_format = 'BPCells:MatrixDir',
file_path = bpcells_matrix_dir,
file_md5sum = val_md5sum,
stringsAsFactors = FALSE))
}
}
else {
if(bpcells_matrix_dir_flag) {
stop('save_monocle_objects: a BPCells count matrix cannot be saved as an HDF5 file.', call.=FALSE)
}
file_path <- file.path(directory_path, hdf5_path)
tryCatch(
HDF5Array::saveHDF5SummarizedExperiment(cds, file_path, replace=TRUE),
error = function(c) { stop(paste0(trimws(c),
'\n error writing ', file_path,
'\n', dbar40,
'\n', report_path_status(dirname(file_path)),
'\n', dbar40,
'\n* error in save_monocle_objects')) })
md5sum <- tools::md5sum(file.path(directory_path, hdf5_path, 'se.rds'))
if(is.na(md5sum)) {
stop(paste0('\n no checksum for file ', file.path(directory_path, hdf5_path, 'se.rds'),
'\n', dbar40,
'\n', report_path_status(file.path(directory_path, hdf5_path, 'se.rds'), dirname(file.path(directory_path, hdf5_path, 'se.rds'))),
'\n', dbar40,
'\n* error in save_monocle_objects'))
}
file_index[['files']] <- rbind(file_index[['files']],
data.frame(cds_object = 'cds',
reduction_method = NA,
object_spec = object_name_to_string(cds),
file_format = 'hdf5',
file_path = hdf5_path,
file_md5sum = md5sum,
stringsAsFactors = FALSE))
}
# Save reduce_dimension annoy indices.
# Notes:
# o save RDS files before the corresponding index files in
# order to enable loading.
#
for(reduction_method in names(methods_reduce_dim)) {
if(methods_reduce_dim[[reduction_method]][['has_annoy_index']]) {
file_path <- file.path(directory_path, methods_reduce_dim[[reduction_method]][['annoy_index_path']])
tryCatch(
save_annoy_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']], file_path),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_monocle_objects')) })
md5sum <- tools::md5sum(file_path)
if(is.na(md5sum)) {
stop(paste0('\n no checksum for file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path),
'\n', dbar40,
'\n* error in save_monocle_objects'))
}
file_index[['files']] <- rbind(file_index[['files']],
data.frame(cds_object = 'reduce_dim_aux',
reduction_method = reduction_method,
object_spec = object_name_to_string(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']]),
file_format = 'annoy_index',
file_path = methods_reduce_dim[[reduction_method]][['annoy_index_path']],
file_md5sum = md5sum,
stringsAsFactors = FALSE))
}
if(methods_reduce_dim[[reduction_method]][['has_hnsw_index']]) {
file_path <- file.path(directory_path, methods_reduce_dim[[reduction_method]][['hnsw_index_path']])
tryCatch(
save_hnsw_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']], file_path),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_monocle_objects')) })
md5sum <- tools::md5sum(file_path)
if(is.na(md5sum)) {
stop(paste0('\n no checksum for file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path),
'\n', dbar40,
'\n* error in save_monocle_objects'))
}
file_index[['files']] <- rbind(file_index[['files']],
data.frame(cds_object = 'reduce_dim_aux',
reduction_method = reduction_method,
object_spec = object_name_to_string(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']]),
file_format = 'hnsw_index',
file_path = methods_reduce_dim[[reduction_method]][['hnsw_index_path']],
file_md5sum = md5sum,
stringsAsFactors = FALSE))
}
if(reduction_method == 'UMAP' && methods_reduce_dim[[reduction_method]][['has_model_index']]) {
file_path <- file.path(directory_path, methods_reduce_dim[[reduction_method]][['umap_index_path']])
md5sum <- tryCatch(
save_umap_nn_indexes(cds@reduce_dim_aux[[reduction_method]][['model']][['umap_model']], file_path),
error = function(cond) { stop(paste0(trimws(c), '\n* error in save_monocle_objects')) })
file_index[['files']] <- rbind(file_index[['files']],
data.frame(cds_object = 'reduce_dim_aux',
reduction_method = reduction_method,
object_spec = object_name_to_string(cds@reduce_dim_aux[[reduction_method]][['model']][['umap_model']]),
file_format = 'umap_annoy_index',
file_path = methods_reduce_dim[[reduction_method]][['umap_index_path']],
file_md5sum = md5sum,
stringsAsFactors = FALSE))
}
}
# Save file_index.rds.
tryCatch(base::saveRDS(file_index, file=file.path(directory_path, 'file_index.rds')),
error = function(c) {stop(paste0(trimws(c),
'\n unable to save file ', file.path(directory_path, 'file_index.rds'),
'\n', dbar40,
'\n', report_path_status(directory_path)),
'\n', dbar40,
'\n* error in save_monocle_objexts') })
if(verbose) {
tryCatch(report_files_saved(file_index),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_monocle_objects')) })
}
#
# Check for saved files.
#
tryCatch( check_monocle_object_files( directory_path, file_index, read_test=FALSE, verbose=verbose ),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_monocle_objects')) })
# Make a tar file of output directory, if requested.
if(archive_control[['archive_type']] == 'tar') {
tryCatch(make_tar_of_dir(directory_path=directory_path, archive_control=archive_control),
error = function(c) { stop(paste0(trimws(c), '\n* error in save_monocle_objects')) })
}
}
#
#' Load a full Monocle3 cell_data_set.
#'
#' Load a full Monocle3 cell_data_set, which was saved using
#' save_monocle_objects(). For more information read the help
#' information for save_monocle_objects().
#'
#' @param directory_path a string giving the name of the directory
#' from which to read the saved cell_data_set files.
#' @param matrix_control a list that is used only to set the
#' BPCells matrix path when the saved cell_data_set has the
#' counts matrix stored as a BPCells on-disk matrix. By default,
#' the BPCells matrix directory path is set to the current
#' working directory.
#' @return a cell_data_set.
#'
#' @examples
#' \dontrun{
#' cds <- load_a549()
#' save_monocle_objects(cds, 'mo')
#' cds1 <- load_monocle_objects('mo')
#' }
#'
#' @export
# Bioconductor forbids writing to user directories so examples
# is not run.
load_monocle_objects <- function(directory_path, matrix_control=list()) {
appendLF <- FALSE
#
# Prepare matrix_control_res.
# Notes:
# o we use only the matrix_control[['matrix_path']] value at this time for
# making the BPCells temporary matrix directory used while Monocle runs.
# All other matrix_control values are ignored.
#
matrix_control_default <- get_global_variable('matrix_control_bpcells_unrestricted')
matrix_control_res <- set_matrix_control(matrix_control=matrix_control, matrix_control_default=matrix_control_default, control_type='unrestricted')
# Make a 'normalized' path string. The annoy index save function does not
# recognize tildes.
directory_path <- normalizePath(directory_path, mustWork=FALSE)
# Check for directory.
if(!file.exists(directory_path))
stop('Directory or file \'', directory_path, '\' does not exist.')
# Check for file_index.rds. If none, assume that 'directory_path' is
# an RDS file.
file_index_path <- file.path(directory_path, 'file_index.rds')
if(!file.exists(file_index_path)) {
cds <- tryCatch(load_monocle_rds(directory_path),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_monocle_objects')) })
return(cds)
}
# Read file index.
file_index <- tryCatch(readRDS(file_index_path),
error = function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_index_path,
'\n', dbar40,
'\n', report_path_status(file_index_path, dirname(file_index_path)),
'\n', dbar40,
'\n* error in load_monocle_objects')) })
# Check that this is a save_monocle_objects archive.
if(file_index[['save_function']] != 'save_monocle_objects') {
stop('The files in ', directory_path, ' are not from save_monocle_objects.')
}
# Write stored comment field.
if(length(file_index[['comment']]) > 1) {
message('File comment: ', file_index[['comment']], appendLF=appendLF)
}
# Loop through the files in file_index.rds in order
# to restore objects.
for(ifile in seq_along(file_index[['files']][['cds_object']])) {
file_path <- file.path(directory_path, file_index[['files']][['file_path']][[ifile]])
file_format <- file_index[['files']][['file_format']][[ifile]]
cds_object <- file_index[['files']][['cds_object']][[ifile]]
reduction_method <- file_index[['files']][['reduction_method']][[ifile]]
md5sum <- file_index[['files']][['file_md5sum']][[ifile]]
if(!file.exists(file_path))
stop('load_monocle_objects: missing file \'', file_path, '\'')
#
# The functions load_umap_nn_indexes and
# loadHDF5SummarizedExperiment check md5sums
# internally so don't check here.
#
check_md5 <- TRUE
if(cds_object == 'reduce_dim_aux' &&
reduction_method == 'UMAP' &&
file_format == 'umap_annoy_index') {
check_md5 <- FALSE
}
if(file_format == 'hdf5') {
check_md5 <- FALSE
}
if(cds_object == 'bpcells_matrix_dir') {
check_md5 <- FALSE
}
# Allow user to over-ride test.
if(is.na(md5sum)) {
check_md5 <- FALSE
}
if(check_md5) {
md5sum_file <- tools::md5sum(file_path)
if(is.na(md5sum_file)) {
stop(paste0('\n no checksum for file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path, dirname(file_path)),
'\n', dbar40,
'\n* error in load_monocle_objects'))
}
if(md5sum_file != md5sum) {
stop(report_checksum_difference('load_monocle_objects', file_index[['files']][['file_path']][[ifile]], md5sum, md5sum_file))
}
}
#
# Note:
# o expect that the RDS file for a reduction_method
# appears before index files for the reduction_method
#
if(cds_object == 'cds') {
if(file_format == 'rds') {
cds <- tryCatch(readRDS(file_path),
error = function(c) {stop(paste0(trimws(c),
'\n unable to read file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path, dirname(file_path)),
'\n', dbar40,
'\n* error in load_monocle_objects')) })
}
else
if(file_format == 'hdf5') {
cds <- tryCatch(
HDF5Array::loadHDF5SummarizedExperiment(file_path),
error = function(c) {stop(paste0(trimws(c),
'\n unable to read file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path, dirname(file_path)),
'\n', dbar40,
'\n* error in load_monocle_objects')) })
}
else {
stop('Unrecognized cds format value \'', file_format, '\'')
}
}
else
if(cds_object == 'reduce_dim_aux') {
if(file_format == 'annoy_index') {
cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']] <- update_annoy_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']])
metric <- cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']][['metric']]
ncolumn <- cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']][['ncol']]
cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']] <- tryCatch(
load_annoy_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['annoy']][['nn_index']], file_path, metric, ncolumn),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_monocle_objects')) })
}
else
if(file_format == 'hnsw_index') {
cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']] <- update_hnsw_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']])
metric <- cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']][['metric']]
ncolumn <- cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']][['ncol']]
cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']] <- tryCatch(
load_hnsw_index(cds@reduce_dim_aux[[reduction_method]][['nn_index']][['hnsw']][['nn_index']], file_path, metric, ncolumn),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_monocle_objects')) })
}
else
if(reduction_method == 'UMAP' && file_format == 'umap_annoy_index') {
cds@reduce_dim_aux[[reduction_method]][['model']][['umap_model']] <- tryCatch(
load_umap_nn_indexes(cds@reduce_dim_aux[[reduction_method]][['model']][['umap_model']], file_path, md5sum),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_monocle_objects')) })
}
else {
stop('Unrecognized file format value \'', file_format, '\'')
}
cds <- set_model_identity_path(cds, reduction_method, directory_path)
}
else
if(cds_object == 'bpcells_matrix_dir') {
if(!is.null(assay(cds, 'counts_row_order'))) {
assay(cds, 'counts_row_order') <- NULL
}
counts(cds, bpcells_warn=FALSE ) <- tryCatch(
load_bpcells_matrix_dir(file_path, md5sum, matrix_control=matrix_control_res),
error = function(c) { stop(paste0(trimws(c), '\n* error in load_monocle_objects')) })
# Rebuild the BPCells row-major order counts matrix.
cds <- set_cds_row_order_matrix(cds=cds)
}
else {
stop('Unrecognized cds_object value \'', cds_object, '\'')
}
}
return(cds)
}
#
# Operations from Monocle3 documentation
# o expression matrix
# o PCA: svd_v svd_sdev and gene_loadings and prop_var_expl
# o LSI: ?
# o Aligned: beta array and ?
# o cell clustering
# o marker genes
# o cell annotations
# o find_gene_modules
# o Garnett annotations?
# o learn_graph()
# o order_cells()
# o fit_models() DE regression analysis
# o graph_test() DE graph-autocorrelation analysis
# o Cicero annotations?
#
# Objects on loading (initial)
# o @ reduce_dim_aux
# o @ principal_graph_aux
# o @ principal_graph
# o @ clusters
# o @ int_elementMetadata
# o @ int_colData
# o @ int_metadata
# o @ rowRanges
# o @ colData
# o @ assays
# o @ NAMES
# o @ elementMetadata
# o @ metadata
# o $ preprocess_aux (vestigial now)
#
# Objects after analysis consisting of
# cds <- preprocess_cds(cds, num_dim = 100)
# cds <- align_cds(cds, num_dim = 100, alignment_group = "plate")
# cds <- reduce_dimension(cds)
# cds <- cluster_cells(cds, resolution=1e-5)
# marker_test_res <- top_markers(cds, group_cells_by="partition", reference_cells=1000, cores=8)
# colData(cds)$assigned_cell_type <- as.character(partitions(cds))
# cds_subset <- choose_cells(cds)
# pr_graph_test_res <- graph_test(cds_subset, neighbor_graph="knn", cores=8)
# pr_deg_ids <- row.names(subset(pr_graph_test_res, morans_I > 0.01 & q_value < 0.05))
# gene_module_df <- find_gene_modules(cds_subset[pr_deg_ids,], resolution=1e-3)
# cds_subset <- cluster_cells(cds_subset, resolution=1e-2)
#
# o @ reduce_dim_aux
# o @ principal_graph_aux
# o @ principal_graph
# o @ clusters
# o @ int_elementMetadata
# o @ int_colData
# o @ int_metadata
# o @ rowRanges
# o @ colData
# o @ assays
# o @ NAMES
# o @ elementMetadata
# o @ metadata
# o $ preprocess_aux (empty)
#
# Objects after analysis of
# cds <- load_monocle_rds('packer_embryo.load.rds')
# cds <- preprocess_cds(cds, num_dim = 50)
# cds <- align_cds(cds, alignment_group = "batch", residual_model_formula_str = "~ bg.300.loading + bg.400.loading + bg.500.1.loading + bg.500.2.loading + bg.r17.loading + bg.b01.loading + bg.b02.loading")
# cds <- reduce_dimension(cds)
# cds <- cluster_cells(cds)
# cds <- learn_graph(cds)
# cds <- order_cells(cds)
# ciliated_genes <- c("che-1", "hlh-17", "nhr-6", "dmd-6", "ceh-36", "ham-1")
# cds_subset <- cds[rowData(cds)$gene_short_name %in% ciliated_genes,]
# gene_fits <- fit_models(cds_subset, model_formula_str = "~embryo.time")
# subset_pr_test_res <- graph_test(cds_subset, neighbor_graph="principal_graph", cores=4)
# pr_deg_ids <- row.names(subset(subset_pr_test_res, q_value < 0.05))
#
# o @ reduce_dim_aux
# o @ principal_graph_aux
# o @ principal_graph
# o @ clusters
# o @ int_elementMetadata
# o @ int_colData
# o @ int_metadata
# o @ rowRanges
# o @ colData
# o @ assays
# o @ NAMES
# o @ elementMetadata
# o @ metadata
#
# Strategy
# o copy slots
# o @ assays
# o @ colData
# o @ rowRanges
# o @ int_metadata ?
# o @ int_colData
# o @ int_elementMetadata
# o @ clusters
# o @ principal_graph
# o @ principal_graph_aux
# o @ reduce_dim_aux
# o transfer
# o @ preprocess_aux -> reduce_dim_aux (done)
#
# Issues
# o deal with missing values/objects; e.g., projection related and nearest neighbors
# o no annoy indices
# o no models initially; partial models after loading
# o no model identities
# o affected functions
# o projection functions
# o save/load models
# o save/load monocle objects?
# o gene_loadings to svd_v and svd_sdev conversion? (done)
# o the user may save a cds made using the recent Monocle3 version
# and load it using load_monocle_objects. In that case, load the
# cds using readRDS and return it unmodified. Use a model_version
# check?
# o can there be an inconsistency between the model and matrix identity
# version information?
load_monocle_rds <- function(file_path) {
appendLF <- TRUE
cds_tmp <- tryCatch(readRDS(file_path),
error=function(c) { stop(paste0(trimws(c),
'\n error reading file ', file_path,
'\n', dbar40,
'\n', report_path_status(file_path, dirname(file_path)),
'\n', dbar40,
'\n* error in load_monocle_rds')) })
cds <- cds_tmp
if(!is.null(SingleCellExperiment::reducedDims(cds_tmp)[['PCA']])) {
if(is.null(cds_tmp@reduce_dim_aux[['PCA']][['model']][['identity']])) {
cds <- initialize_reduce_dim_model_identity(cds, 'PCA')
}
if(!is.null(attr(cds, which='preprocess_aux')) && !is.null(cds_tmp@preprocess_aux[['gene_loadings']])) {
gene_loadings <- cds_tmp@preprocess_aux[['gene_loadings']]
svd_sdev <- apply(gene_loadings, 2, function(x) {sqrt(sum(x^2))})
svd_v <- gene_loadings %*% diag(1.0/svd_sdev)
colnames(svd_v) <- paste("PC", seq(1, ncol(svd_v)), sep="")
cds@reduce_dim_aux[['PCA']][['model']][['svd_sdev']] <- svd_sdev
cds@reduce_dim_aux[['PCA']][['model']][['svd_v']] <- svd_v
cds@reduce_dim_aux[['PCA']][['model']][['prop_var_expl']] <- cds_tmp@preprocess_aux[['prop_var_expl']]
}
}
if(!is.null(SingleCellExperiment::reducedDims(cds_tmp)[['LSI']])) {
if(is.null(cds_tmp@reduce_dim_aux[['LSI']][['model']][['identity']])) {
cds <- initialize_reduce_dim_model_identity(cds, 'LSI')
}
if(!is.null(attr(cds, which='preprocess_aux')) && !is.null(cds_tmp@preprocess_aux[['gene_loadings']])) {
gene_loadings <- cds_tmp@preprocess_aux[['gene_loadings']]
svd_sdev <- apply(gene_loadings, 2, function(x) {sqrt(sum(x^2))})
cds@reduce_dim_aux[['LSI']][['model']][['svd_v']] <- gene_loadings %*% diag(1.0/svd_sdev)
cds@reduce_dim_aux[['LSI']][['model']][['svd_sdev']] <- svd_sdev
}
}
if(!is.null(SingleCellExperiment::reducedDims(cds_tmp)[['Aligned']])) {
if(is.null(cds_tmp@reduce_dim_aux[['Aligned']][['model']][['identity']])) {
cds <- initialize_reduce_dim_model_identity(cds, 'Aligned')
}
if(!is.null(attr(cds, which='preprocess_aux')) && !is.null(cds_tmp@preprocess_aux$beta)) {
cds@reduce_dim_aux[['Aligned']][['model']][['beta']] <- cds_tmp@preprocess_aux$beta
}
}
if(!is.null(SingleCellExperiment::reducedDims(cds_tmp)[['tSNE']])) {
if(is.null(cds_tmp@reduce_dim_aux[['tSNE']][['model']][['identity']])) {
cds <- initialize_reduce_dim_model_identity(cds, 'tSNE')
}
}
if(!is.null(SingleCellExperiment::reducedDims(cds_tmp)[['UMAP']])) {
if(is.null(cds_tmp@reduce_dim_aux[['UMAP']][['model']][['identity']])) {
cds <- initialize_reduce_dim_model_identity(cds, 'UMAP')
}
}
if(is.null(SingleCellExperiment::int_metadata(cds_tmp)[['counts_metadata']])) {
cds <- initialize_counts_metadata(cds)
}
# The RDS file may have a preprocess_aux slot, which doesn't
# exist in the current cell_data_set class. The attr command
# appears to remove it whereas setting cds@preprocess_cds
# and cds$preprocess_cds to NULL do not.
if(!is.null(attr(cds, which='preprocess_aux'))) {
attr(cds, which='preprocess_aux') <- NULL
}
rm(cds_tmp)
return(cds)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.