R/matrix.R

Defines functions convert_counts_matrix check_bpcells_counts_matrix_pair set_cds_row_order_matrix rm_bpcells_dir set_matrix_class set_matrix_citation compare_matrix_control show_matrix_info get_matrix_info get_matrix_class bpcells_find_base_matrix push_matrix_path show_matrix_control set_matrix_control set_matrix_control_default check_matrix_control select_matrix_parameter_value

Documented in convert_counts_matrix set_cds_row_order_matrix set_matrix_control

################################################################################
# BPCells
################################################################################

# Notes:
#   o  the SummarizedExperiment Assay slot information states
#        o  the developer needs to be careful to implement endomorphisms with copy-on-modify semantics
#        o  an Assays concrete subclass needs to implement lossless back and forth coercion from/to
#           SimpleList
# 
# Questions:
#   o  are MatrixDir directories relocatable? The directory is not relocatable while the R MatrixDir
#      object exists. The saved directory that persists after the R session ends can be relocated before
#      being 'reloaded' with the open_matrix_dir() function.
#   o  do BPCells matrices 'respect' copy-on-modify semantics?
#   o  how does one deal with copy-on-semantics and BPCells queued (delayed) operations?
# 
# Ben Parks notes:
# 
#   o  on copy-on-modify semantics
#       First, for the copy-on-modify semantics -- I think BPCells is fine in this respect. As a
#        general rule, BPCells does not keep open file handles sitting around, and files are
#        re-opened each time a data-reading operation is taking place then closed at the end of
#        the operation. Additionally, the only way to modify the contents of an existing matrix
#        on disk or in memory is to write a new matrix while explicitly asking to overwrite the
#        old one. The end result is that the on-disk data will (intentionally) persist across R
#        sessions, and multiple R objects can happily read from the same data source without
#        worrying that one of them will accidentally modify data on disk. Since BPCells objects
#        are basically plain R objects with one field listing a file path, modifications to the
#        BPCells objects automatically support copy-on-modify semantics. The only way to break
#        that is by intentionally changing the contents of the underlying files on disk, which
#        BPCells will never do unless explicitly asked to overwrite data.
# 
#        The only note I'd give is BPCells doesn't support the [<- operator, which would
#        otherwise be the most problematic. Row and column names can be modified, but the
#        modifications take place only in R and only go to disk when explicitly writing the
#        matrix again.
# 
#   o on getting BPCells matrix path/type
# 
#        As for getting the directory path from queued operations, your tactic should mostly
#        work with the exception of sparse matrix multiplies or rbind/cbind operations.
#        There's currently an unexported method called matrix_inputs which does basically the
#        same thing -- it unwraps a single layer of queued operations returning a list of inputs
#        (usually but not always length 1). I might make a helper function that can perform this
#        recursively to get a list of the raw matrix inputs across arbitrarily many operations.
#        For now, I'd recommend trying to directly call the unexported method
#        BPCells:::matrix_inputs within your bpcells_find_base_matrix function, and we can swap
#        that for a more robust and exported method down the line.
# 
# BPCells methods (not all)
#      length(mat)
#      length(mat[,1])
#      length(mat[1,])
#      names(mat)
#      names(mat[,1])
#      names(mat[1,])
#      dim(mat)
#      rownames(mat)
#      colnames(mat)
#      dimnames(mat)
#      storage_order(mat)
#      rowSums(mat)
#      colSums(mat)
#      rowMeans(mat)
#      colMeans(mat)
#      selection_index()
#      rbind2()
#      cbind2()
#      transpose_storage_order(mat)
#      write_matrix_memory()
#      write_matrix_dir()
#      open_matrix_dir()
#      write_matrix_hdf5()
#      open_matrix_hdf5()
#      open_matrix_10x_hdf5()
#      write_matrix_10x_hdf5()
#      open_matrix_anndata_hdf5()
#      convert_matrix_type()
#      matrix_stats()


# select_matrix_parameter_value() is used by set_matrix_control to select matrix_control values.
select_matrix_parameter_value <- function(parameter, matrix_control, matrix_control_default, default_value) {
  if(!is.null(matrix_control[[parameter]])) {
    return(matrix_control[[parameter]])
  }
  else
  if(!is.null(matrix_control_default[[parameter]])) {
    return(matrix_control_default[[parameter]])
  }
  return(default_value)
}


# Usage
#   matrix_class: default: 'dgCMatrix'
#   matrix_class: 'BPCells'
#     matrix_mode: 'mem'  default: 'dir' NOTE: disallow matrix_mode 'mem'
#       matrix_type: 'float', 'double'  default: 'double'
#       matrix_compress: TRUE, FALSE default: FALSE
#       matrix_bpcells_copy: TRUE, FALSE default: TRUE
#     matrix_mode: 'dir'
#       matrix_type: 'float', 'double' default: 'double'
#       matrix_path: <path to directory or file> default: 'NULL' -> temporary directory in pwd
#       matrix_compress: TRUE, FALSE default: FALSE
#       matrix_buffer_size: <integer> default: 8192L
#       matrix_bpcells_copy: TRUE, FALSE default: TRUE
#
#  parameter check_conditional is boolean: conditional=TRUE is more stringent, requiring and checking certain
#    values conditioned on other values, for example, 'matrix_buffer_size' is used only when
#    matrix_class is 'BPCells' and matrix_mode is 'dir'. check_matrix_control(..., check_conditional=FALSE) is
#    called at the start of set_matrix_control() and check_matrix_control(..., check_conditional=TRUE) is
#    called at the end of set_matrix_control(). More generally, check_conditional=FALSE is used before trying
#    to set consistent matrix_control values and check_conditional=TRUE is used after trying to set consistent
#    matrix_control values.
#
check_matrix_control <- function(matrix_control=list(), control_type=c('unrestricted', 'pca'), check_conditional=FALSE) {
  control_type <- match.arg(control_type)
  assertthat::assert_that(is.list(matrix_control))
  assertthat::assert_that(is.logical(check_conditional))

  # If matrix_control list is zero length then return,
  # otherwise, matrix_control[['matrix_class']] must
  # be defined.
  if(length(matrix_control) == 0) {
    return()
  }
  else
  if(is.null(matrix_control[['matrix_class']])) {
    stop('matrix_control[[\'matrix_class\']] missing in matrix_control list.')
  }
  
  allowed_control_parameters <- c('matrix_class',
                                  'matrix_mode',
                                  'matrix_type',
                                  'matrix_compress',
                                  'matrix_path',
                                  'matrix_buffer_size',
                                  'matrix_bpcells_copy',
                                  'show_values')

  allowed_matrix_class <- c('dgCMatrix', 'BPCells')

  allowed_matrix_mode <- list()
  allowed_matrix_mode[['unrestricted']] <- c('dir')
  allowed_matrix_mode[['pca']] <- c('dir')

  allowed_matrix_type <- list()
  allowed_matrix_type[['unrestricted']] <- c('float', 'double')
  allowed_matrix_type[['pca']] <- c('float', 'double')

  allowed_matrix_compress <- list()
  allowed_matrix_compress[['unrestricted']] <- c(FALSE, TRUE)
  allowed_matrix_compress[['pca']] <- c(FALSE)

  error_string <- ''

  if(!all(names(matrix_control) %in% allowed_control_parameters)) {
    error_string <- paste0('invalid control parameter')
  }

  if(check_conditional == FALSE) {
    # Is matrix_class set and a valid value?
    if(!(is.null(matrix_control[['matrix_class']])) &&
       !(matrix_control[['matrix_class']] %in% allowed_matrix_class)) {
      error_string <- paste0('\ninvalid matrix_class "', matrix_control[['matrix_class']], '"')
    }

    # BPCells matrix class tests.
    if(matrix_control[['matrix_class']] == 'BPCells') {

      # Is matrix_mode set and a valid value?
      if(control_type == 'unrestricted')
        allowed_values <- allowed_matrix_mode[['unrestricted']]
      else
      if(control_type == 'pca')
        allowed_values <- allowed_matrix_mode[['pca']]
      else
        stop('check_matrix_control: unknown control type \'', control_type, '\'')
      if(!(is.null(matrix_control[['matrix_mode']])) &&
         !(matrix_control[['matrix_mode']] %in% allowed_values)) {
        error_string <- paste0('\ninvalid matrix_mode "', matrix_control[['matrix_mode']], '"')
      }

      # Is matrix_type set and a valid value?
      if(control_type == 'unrestricted')
        allowed_values <- allowed_matrix_type[['unrestricted']]
      else
      if(control_type == 'pca')
        allowed_values <- allowed_matrix_type[['pca']]
      else
        stop('check_matrix_control: unknown control type \'', control_type, '\'')
      if(!(is.null(matrix_control[['matrix_type']])) &&
         !(matrix_control[['matrix_type']] %in% allowed_values)) {
        error_string <- paste0('\ninvalid matrix_type "', matrix_control[['matrix_type']], '"')
      }

      # Is matrix_compress set and a data valid type?
      if(!(is.null(matrix_control[['matrix_compress']])) &&
         !(is.logical(matrix_control[['matrix_compress']]))) {
        error_string <- paste0('\nmatrix_compress value must be a logical type')
      }

      # Is matrix_path set and a data valid type?
      if(!(is.null(matrix_control[['matrix_path']])) &&
         !(is.character(matrix_control[['matrix_path']]))) {
        error_string <- paste0('\nmatrix_path value must be a character type')
      }

      # Is matrix_buffer_size set and a data valid type?
      if(!(is.null(matrix_control[['matrix_buffer_size']])) &&
         !(is.integer(matrix_control[['matrix_buffer_size']]))) {
        error_string <- paste0('\nmatrix_buffer_size value must be an integer type')
      }

      # Is matrix_bpcells_copy set and a valid data type?
      if(!(is.null(matrix_control[['matrix_bpcells_copy']])) &&
         !(is.logical(matrix_control[['matrix_bpcells_copy']]))) {
        error_string <- paste0('\nmatrix_bpcells_copy value must be a logical type')
      }
    }
  }
  else {
    # Is matrix_class set and a valid value?
    if(is.null(matrix_control[['matrix_class']])) {
      error_string <- '\nmatrix_control[[\'matrix_class\']] missing in matrix_control list.'
    }
    else
    if(!(matrix_control[['matrix_class']] %in% allowed_matrix_class)) {
      error_string <- paste0('\ninvalid matrix_class "', matrix_control[['matrix_class']], '\n')
    }

    # BPCells matrix class tests.
    if(matrix_control[['matrix_class']] == 'BPCells') {

      # Is matrix_type a valid value?
      if(control_type == 'unrestricted')
        allowed_values <- allowed_matrix_type[['unrestricted']]
      else
      if(control_type == 'pca')
        allowed_values <- allowed_matrix_type[['pca']]
      else
        stop('check_matrix_control: unknown control type \'', control_type, '\'')
      if(!(matrix_control[['matrix_type']] %in% allowed_values)) {
        error_string <- paste0('\nbad matrix_type "', matrix_control[['matrix_type']], '"\n')
      }
  
      # Check matrix_compress value.
      if(!is.logical(matrix_control[['matrix_compress']])) {
        error_string <- '\nmatrix_compress must be a logical type'
      }
      if(control_type == 'unrestricted')
        allowed_values <- allowed_matrix_compress[['unrestricted']]
      else
      if(control_type == 'pca')
        allowed_values <- allowed_matrix_compress[['pca']]
      else
        stop('check_matrix_control: unknown control type \'', control_type, '\'')
      if(!(matrix_control[['matrix_compress']] %in% allowed_values)) {
        error_string <- paste0('\nbad matrix_compress "', matrix_control[['matrix_compress']], '"\n')
      }
 
      # Is matrix_mode a valid value?
      if(control_type == 'unrestricted')
        allowed_values <- allowed_matrix_mode[['unrestricted']]
      else
      if(control_type == 'pca')
        allowed_values <- allowed_matrix_mode[['pca']]
      else
        stop('check_matrix_control: unknown control type \'', control_type, '\'')
      if(!(matrix_control[['matrix_mode']] %in% allowed_values)) {
        error_string <- paste0('\ninvalid matrix_mode "', matrix_control[['matrix_mode']], '"')
      }
  
      # Check values related to matrix_mode = 'dir'.
      if(matrix_control[['matrix_mode']] == 'dir') {

        # Is matrix_path a valid data type?
        if(!(is.character(matrix_control[['matrix_path']]))) {
          error_string <- paste0('\nbad matrix_path "', matrix_control[['matrix_path']], '"')
        }
  
        # Is matrix_buffer_size a valid data type?
        if(!(is.integer(matrix_control[['matrix_buffer_size']]))) {
          error_string <- paste0('\nmatrix_buffer_size must be an integer')
        }
      }

      # Is matrix_bpcells_copy a valid data type?
      if(!is.logical(matrix_control[['matrix_bpcells_copy']])) {
        error_string <- paste0('\nmatrix_bpcells_copy value must be a logical type')
      }
    }
  }

  if(error_string != '') {
    stop(paste0(stringr::str_trim(error_string), '\n'))
  }

  return(TRUE)
}


#
#  Return a default matrix_control list based on the matrix class and
#  the control_type value. If the matrix_control list is not empty, use
#  matrix_class in matrix_control, otherwise, use the global matrix_class_default
#  value.  The recognized control_type values are 'unrestricted' and 'pca'.
#  The returned default matrix_control list is used by the  set_matrix_control*()
#  functions.
#  Notes:
#    o  matrix_control[['matrix_class']] must be set in a matrix_control list.
#
set_matrix_control_default <- function(matrix_control=list(), control_type = c('unrestricted', 'pca')) {
  assertthat::assert_that(is.list(matrix_control),
                          msg = paste0('set_matrix_control_default: matrix_control parameter must be a list.'))

  control_type <- match.arg(control_type)

  if(length(matrix_control) == 0) {
    matrix_class <- get_global_variable('matrix_class_default')
  }
  else {
    if(is.null(matrix_control[['matrix_class']])) {
      stop(paste0('\n  matrix_control[[\'matrix_class\']] missing in matrix_control list.'))
    }
    matrix_class <- matrix_control[['matrix_class']]
  }
  if(matrix_class == 'dgCMatrix') {
    if(control_type == 'unrestricted') {
      matrix_control_default <- get_global_variable('matrix_control_csparsematrix_unrestricted')
    }
    else
    if(control_type == 'pca') {
      matrix_control_default <- get_global_variable('matrix_control_csparsematrix_pca')
    }
    else {
      stop(paste0('\n  unrecognized control_type value: ', control_type))
    }
  }
  else
  if(matrix_class == 'BPCells') {
    if(control_type == 'unrestricted') {
      matrix_control_default <- get_global_variable('matrix_control_bpcells_unrestricted')
    }
    else
    if(control_type == 'pca') {
      matrix_control_default <- get_global_variable('matrix_control_bpcells_pca')
    }
    else {
      stop(paste0('\n  unrecognized control_type value: ', control_type))
    }
  }
  else {
      stop(paste0('\n  unrecognized matrix_class value: ', matrix_class))
  }

  return(matrix_control_default)
}


# Usage
#   matrix_class: default: 'dgCMatrix'
#   matrix_class: 'BPCells'
#     matrix_mode: 'mem'  default: 'dir'
#       matrix_type: 'float', 'double' default: 'double'
#       matrix_compress: TRUE, FALSE default: FALSE
#       matrix_bpcells_copy: TRUE, FALSE default: TRUE
#     matrix_mode: 'dir'
#       matrix_type: 'float', 'double' default: 'double'
#       matrix_path: <path to directory or file> default: 'NULL' -> temporary directory in pwd
#       matrix_compress: TRUE, FALSE default: FALSE
#       matrix_buffer_size: <integer> default: 8192L
#       matrix_bpcells_copy: TRUE, FALSE default: TRUE
# Notes:
#   o  matrix_control[['matrix_class']] must be set in
#      a matrix_control list.

#' Verify and set the matrix_control parameter list.
#'
#' @description Verifies and sets the list of parameter values
#'   that is used to make the count matrix that is stored in the
#'   cell_data_set or certain other matrices that are used
#'   during the Monocle3 run. To see the
#'   default values,
#'   call "set_matrix_control(matrix_control=list(matrix_class='BPCells', show_values=TRUE))".
#'   "show_values=TRUE" can be used in functions that have the
#'   matrix_control list parameter, in which case the function will
#'   show the matrix_control values to be used and then stop.
#' @param matrix_control Input control list.
#' @param matrix_control_default Input default control list.
#' @param control_type A string of either "unrestricted" or "pca". A control_type of "pca"
#'   restricts certain list parameters.
#' @return matrix_control Output control list.
#'
#' @section matrix_control list values:
#' \describe{
#'   \item{matrix_class}{A string that specifies the matrix
#'      class to use for matrix storage. The acceptable
#'      values are "dgCMatrix" and "BPCells". matrix_class
#'      is required.}
#'   \item{matrix_type}{A string that specifies whether to
#'      store the matrix values as single precision "floats"
#'      (matrix_type="float") or double precision "doubles"
#'      (matrix_type="double"). The default is "double".
#'      "matrix_type" is used only for BPCells class matrices.}
#'   \item{matrix_mode}{A string that specifies whether to
#'      store the BPCells class matrix in memory
#'      (matrix_mode="mem") or on disk (matrix_mode="dir").
#'      "matrix_mode" is used only for BPCells class
#'      matrices. At this time, only "dir" is allowed.}
#'   \item{matrix_path}{A string that specifies the directory
#'      where the BPCells on-disk matrix data are stored in a
#'      sub-directory with a randomized name. The default is
#'      in the directory where R is running. "matrix_path" is
#'      used only for BPCells class matrices with
#'      matrix_mode="dir". For example, if matrix_path is set
#'      to "my_dir" Monocle3 will create a directory with a
#'      name that has the form "monocle.bpcells.*.tmp" in
#'      "my_dir". The asterisk represents a random string that
#'      makes the name unique.}
#'   \item{matrix_compress}{A logical that specifies whether
#'      to use bit-packing compression to store BPCells matrix
#'      values. Only the matrix indices are compressed for
#'      matrix_types "float" and "double". "matrix_compress"
#'      is used only for BPCells class matrices. The default is
#'      FALSE, which improves processing speed.}
#'   \item{matrix_buffer_size}{An integer that specifies how
#'      many items of data to buffer in memory before flushing
#'      to disk. This is used for matrix_class="BPCells" with
#'      matrix_mode="dir". The default is 8192L.}
#'   \item{matrix_bpcells_copy}{A logical that specifies
#'      whether the input BPCells matrix is to be copied. This
#'      is relevant only when the input matrix and the desired
#'      output matrix are the same; that is, have the same
#'      matrix_mode, matrix_path, matrix_compress, and
#'      matrix_buffer_size values. If matrix_bpcells_copy is
#'      TRUE, the queued operations are applied to a new
#'      on-disk copy of of the input matrix and the operation
#'      queue is emptied. If FALSE, the queued operations are
#'      not applied and the on-disk storage is unaltered.}
#' }
#' @export
set_matrix_control <- function(matrix_control=list(), matrix_control_default=list(), control_type=c('unrestricted', 'pca')) {
  assertthat::assert_that(is.list(matrix_control),
                          msg = paste0('set_matrix_control: matrix_control parameter must be a list.'))
  assertthat::assert_that(is.list(matrix_control_default),
                          msg = paste0('set_matrix_control: matrix_control_default parameter must be a list.'))

  control_type <- match.arg(control_type)

  # Set matrix_control_default when not given on command line:
  # use the command line matrix_control parameter to get the
  # matrix_class.
  if(length(matrix_control_default) == 0) {
    matrix_control_default <- tryCatch(set_matrix_control_default(matrix_control=matrix_control, control_type=control_type),
                                error = function(c) { stop(paste0(trimws(c), '\n* error in set_matrix_control')) })
  }

  tryCatch(check_matrix_control(matrix_control=matrix_control, control_type=control_type, check_conditional=FALSE),
    error = function(c) { stop(paste0(trimws(c), '\n* error in set_matrix_control')) })
  tryCatch(check_matrix_control(matrix_control=matrix_control_default, control_type=control_type, check_conditional=FALSE),
    error = function(c) { stop(paste0(trimws(c), '\n* error in set_matrix_control')) })

  #
  # Last resort fall-back parameter values.
  #
  default_matrix_class <- 'dgCMatrix'
  default_matrix_mode <- NULL
  default_matrix_type <- NULL
  default_matrix_compress <- NULL
  default_matrix_path <- NULL
  default_matrix_buffer_size <- NULL
  default_matrix_bpcells_copy <- NULL

  matrix_control_out = list()

  matrix_control_out[['matrix_class']] <- select_matrix_parameter_value(parameter='matrix_class', matrix_control=matrix_control, matrix_control_default=matrix_control_default, default_value=default_matrix_class)

  if(matrix_control_out[['matrix_class']] == 'BPCells') {
     matrix_control_out[['matrix_mode']] <- select_matrix_parameter_value(parameter='matrix_mode', matrix_control=matrix_control, matrix_control_default=matrix_control_default, default_value=default_matrix_mode)
    if(matrix_control_out[['matrix_mode']] == 'mem') {
       matrix_control_out[['matrix_type']] <- select_matrix_parameter_value(parameter='matrix_type', matrix_control=matrix_control, matrix_control_default=matrix_control_default, default_value=default_matrix_type)
       matrix_control_out[['matrix_compress']] <- select_matrix_parameter_value(parameter='matrix_compress', matrix_control=matrix_control, matrix_control_default=matrix_control_default, default_value=default_matrix_compress)
       matrix_control_out[['matrix_bpcells_copy']] <- select_matrix_parameter_value(parameter='matrix_bpcells_copy', matrix_control=matrix_control, matrix_control_default=matrix_control_default, default_value=default_matrix_bpcells_copy)
    }
    else
    if(matrix_control_out[['matrix_mode']] == 'dir') {
       matrix_control_out[['matrix_type']] <- select_matrix_parameter_value(parameter='matrix_type', matrix_control=matrix_control, matrix_control_default=matrix_control_default, default_value=default_matrix_type)
       matrix_control_out[['matrix_path']] <- select_matrix_parameter_value(parameter='matrix_path', matrix_control=matrix_control, matrix_control_default=matrix_control_default, default_value=default_matrix_path)
       matrix_control_out[['matrix_compress']] <- select_matrix_parameter_value(parameter='matrix_compress', matrix_control=matrix_control, matrix_control_default=matrix_control_default, default_value=default_matrix_compress)
       matrix_control_out[['matrix_buffer_size']] <- select_matrix_parameter_value(parameter='matrix_buffer_size', matrix_control=matrix_control, matrix_control_default=matrix_control_default, default_value=default_matrix_buffer_size)
       matrix_control_out[['matrix_bpcells_copy']] <- select_matrix_parameter_value(parameter='matrix_bpcells_copy', matrix_control=matrix_control, matrix_control_default=matrix_control_default, default_value=default_matrix_bpcells_copy)
    }

    # Restrict matrix_control values for matrices used in
    # intensive PCA calculations so set matrix_type from uint32_t
    # to double, set matrix_compress to FALSE for speed, and
    # matrix_bpcells_copy to TRUE because we want a temporary
    # matrix for the calculation, after which we remove it.
    if(control_type == 'pca') {
      if(matrix_control_out[['matrix_type']] == 'uint32_t') {
        message('set_matrix_control: forcing matrix_type to \'double\' for PCA.')
        matrix_control_out[['matrix_type']] <- 'double'
      }
      if(matrix_control_out[['matrix_compress']] == TRUE) {
        message('set_matrix_control: forcing matrix_compress to \'FALSE\' for PCA.')
        matrix_control_out[['matrix_compress']] <- FALSE
      }
      if(matrix_control_out[['matrix_bpcells_copy']] == FALSE) {
        message('set_matrix_control: forcing matrix_bpcells_copy to \'TRUE\' for PCA.')
        matrix_control_out[['matrix_bpcells_copy']] <- TRUE
      }
    }
  }

  tryCatch(check_matrix_control(matrix_control=matrix_control_out, control_type=control_type, check_conditional=TRUE),
    error = function(c) { stop(paste0(trimws(c), '\n* error in set_matrix_control')) })

  #
  # Set BPCells out-of-core file/directory name.
  #
  if(matrix_control_out[['matrix_class']] == 'BPCells' &&
     matrix_control_out[['matrix_mode']] == 'dir' &&
     is.null(matrix_control_out[['matrix_path']])) {
        matrix_control_out[['matrix_path']] <- '.'
  }

  #
  # Display matrix_control list if matrix_control[['show_values']] <- TRUE.
  #
  if(!is.null(matrix_control[['show_values']]) && matrix_control[['show_values']] == TRUE)
  {
    show_matrix_control(matrix_control=matrix_control_out, label='  matrix_control: ')
    stop_no_noise()
  }

  return(matrix_control_out)
}


# Report matrix_control list values.
show_matrix_control <- function(matrix_control=list(), label=NULL) {
  message('matrix_control:')
  indent <- ''
  if(!is.null(label)) {
    indent <- '  '
  }
  message(indent, '  matrix_class: ', ifelse(!is.null(matrix_control[['matrix_class']]), matrix_control[['matrix_class']], as.character(NA)))

  if(matrix_control[['matrix_class']] == 'dgCMatrix') {
    # nop
    invisible(NULL)
  }
  else
  if(matrix_control[['matrix_class']] == 'BPCells') {
    if(matrix_control[['matrix_mode']] == 'mem') {
      message(indent, '  matrix_mode: ', ifelse(!is.null(matrix_control[['matrix_mode']]), matrix_control[['matrix_mode']], as.character(NA)))
      message(indent, '  matrix_type: ', ifelse(!is.null(matrix_control[['matrix_type']]), matrix_control[['matrix_type']], as.character(NA)))
      message(indent, '  matrix_compress: ', ifelse(!is.null(matrix_control[['matrix_compress']]), matrix_control[['matrix_compress']], as.character(NA)))
      message(indent, '  matrix_bpcells_copy: ', ifelse(!is.null(matrix_control[['matrix_bpcells_copy']]), matrix_control[['matrix_bpcells_copy']], as.character(NA)))
    }
    else
    if(matrix_control[['matrix_mode']] == 'dir') {
      message(indent, '  matrix_mode: ', ifelse(!is.null(matrix_control[['matrix_mode']]), matrix_control[['matrix_mode']], as.character(NA)))
      message(indent, '  matrix_type: ', ifelse(!is.null(matrix_control[['matrix_type']]), matrix_control[['matrix_type']], as.character(NA)))
      message(indent, '  matrix_path: ', ifelse(!is.null(matrix_control[['matrix_path']]), matrix_control[['matrix_path']], as.character(NA)))
      message(indent, '  matrix_compress: ', ifelse(!is.null(matrix_control[['matrix_compress']]), matrix_control[['matrix_compress']], as.character(NA)))
      message(indent, '  matrix_buffer_size: ', ifelse(!is.null(matrix_control[['matrix_buffer_size']]), matrix_control[['matrix_buffer_size']], as.character(NA)))
      message(indent, '  matrix_bpcells_copy: ', ifelse(!is.null(matrix_control[['matrix_bpcells_copy']]), matrix_control[['matrix_bpcells_copy']], as.character(NA)))
    }
    else {
      stop('show_matrix_control: unsupported matrix class/mode/...\'', matrix_control[['method']], '\'')
    }
  }
  else {
    stop('show_matrix_control: unsupported matrix class/mode/...\'', matrix_control[['method']], '\'')
  }
}


# Push the directory path of the BPCells on-disk matrix onto the path stack. This is used
# to remove the directories when R is exited.
push_matrix_path <- function(mat) {
  matrix_info <- get_matrix_info(mat=mat)
  if(matrix_info[['matrix_class']] == 'BPCells' &&
     matrix_info[['matrix_mode']] == 'dir') {
    matrix_path <- matrix_info[['matrix_path']]
    x <- get_global_variable('monocle_gc_matrix_path')
    x <- append(x, matrix_path)
    set_global_variable('monocle_gc_matrix_path', x)
  }
}


#
# Return the 'base' BPCells matrix.
# Notes:
#   o uses
#      o get base matrix class
#      o get base matrix slots and their information, e.g., @type and @dir..
#
bpcells_find_base_matrix <- function(mat) {
  if(length(BPCells:::matrix_inputs(mat)) > 0) {
    return(bpcells_find_base_matrix(mat=BPCells:::matrix_inputs(mat)[[1]]))
  }
  return(mat)
}


#
# Known matrix types:
#   o  dense matrix: matrix
#   o  sparse matrix: CsparseMatrix, dgCMatrix, ...
#   o  BPCells matrix:
#        o  matrix_mode: 'dir'
#        o  matrix_type: float, double
#        o  matrix_compress: TRUE, FALSE
#        o  matrix_path
#        o  matrix_buffer_size
#
get_matrix_class <- function(mat) {

  nmatch <- 0
  matrix_info <- list()
  if(is(mat, 'matrix')) {
    matrix_info[['matrix_class']] <- 'r_dense_matrix'
    nmatch <- nmatch + 1
  }
  if(any(class(mat) %in% c('dgCMatrix'))) {
    matrix_info[['matrix_class']] <- 'dgCMatrix'
    nmatch <- nmatch + 1
  }
  if(any(class(mat) %in% c('lgCMatrix'))) {
    matrix_info[['matrix_class']] <- 'lgCMatrix'
    nmatch <- nmatch + 1
  }
  if(is(mat, 'dgTMatrix')) {
    matrix_info[['matrix_class']] <- 'dgTMatrix'
    nmatch <- nmatch + 1
  }
  if(is(mat, 'dgeMatrix')) {
    matrix_info[['matrix_class']] <- 'dgeMatrix'
    nmatch <- nmatch + 1
  }
  if(is(mat, 'IterableMatrix')) {
    matrix_info[['matrix_class']] <- 'BPCells'
    nmatch <- nmatch + 1
  }

  if(nmatch == 0) {
    stop('get_matrix_class: unrecognized matrix class')
  }
  if(nmatch > 1) {
    stop('get_matrix_class: ambiguous matrix class')
  }

  return(matrix_info)
}


# Get/infer the matrix information.
get_matrix_info <- function(mat) {
  matrix_info <- tryCatch(get_matrix_class(mat=mat),
                   error=function(c) {stop(paste0(trimws(c),
                                                  '\n* error in get_matrix_info')) })

  if(is.null(matrix_info[['matrix_class']])) {
    stop('get_matrix_info: unable to infer matrix_class')
  }

  if(matrix_info[['matrix_class']] != 'BPCells') {
    return(matrix_info)
  }

  bmat <- bpcells_find_base_matrix(mat=mat)

  # In-memory iterable matrix object classes.
  #   UnpackedMatrixMem_uint32_t
  #   UnpackedMatrixMem_float
  #   UnpackedMatrixMem_double
  #   PackedMatrixMem_uint32_t
  #   PackedMatrixMem_float
  #   PackedMatrixMem_double
  #
  # On-disk iterable matrix object classes and relevant slots.
  #   MatrixDir: slots: dir, compressed, buffer_size, type
  #     compressed: logical
  #     type: character: 'uint32_t', 'float', 'double'
  #     buffer_size: int
  #     dir: character: path
  if(!(class(bmat) %in% c('UnpackedMatrixMem_uint32_t', 'UnpackedMatrixMem_float',
                          'UnpackedMatrixMem_double', 'PackedMatrixMem_uint32_t',
                          'PackedMatrixMem_float', 'PackedMatrixMem_double',
                          'MatrixDir', 'Iterable_dgCMatrix_wrapper'))) {
    stop('get_matrix_info: unrecognized BPCells matrix class \"', class(mat), '\"')
    return(NULL)
  }

  matrix_info[['matrix_class']] <- 'BPCells'

  if(class(bmat) == 'Iterable_dgCMatrix_wrapper') {
    matrix_info[['matrix_mode']] <- 'dgCMatrix'
  }
  else
  if(class(bmat) == 'UnpackedMatrixMem_uint32_t') {
    matrix_info[['matrix_mode']] <- 'mem'
    matrix_info[['matrix_type']] <- 'uint32_t'
    matrix_info[['matrix_compress']] <- FALSE
  }
  else
  if(class(bmat) == 'UnpackedMatrixMem_float') {
    matrix_info[['matrix_mode']] <- 'mem'
    matrix_info[['matrix_type']] <- 'float'
    matrix_info[['matrix_compress']] <- FALSE
  } 
  else
  if(class(bmat) == 'UnpackedMatrixMem_double') {
    matrix_info[['matrix_mode']] <- 'mem'
    matrix_info[['matrix_type']] <- 'double'
    matrix_info[['matrix_compress']] <- FALSE
  } 
  else
  if(class(bmat) == 'PackedMatrixMem_uint32_t') {
    matrix_info[['matrix_mode']] <- 'mem'
    matrix_info[['matrix_type']] <- 'uint32_t'
    matrix_info[['matrix_compress']] <- TRUE
  } 
  else
  if(class(bmat) == 'PackedMatrixMem_float') {
    matrix_info[['matrix_mode']] <- 'mem'
    matrix_info[['matrix_type']] <- 'float'
    matrix_info[['matrix_compress']] <- TRUE
  } 
  else
  if(class(bmat) == 'PackedMatrixMem_double') {
    matrix_info[['matrix_mode']] <- 'mem'
    matrix_info[['matrix_type']] <- 'double'
    matrix_info[['matrix_compress']] <- TRUE
  } 
  else
  if(class(bmat) == 'MatrixDir') {
    matrix_info[['matrix_mode']] <- 'dir'
    matrix_info[['matrix_type']] <- bmat@type
    matrix_info[['matrix_compress']] <- bmat@compressed
    matrix_info[['matrix_buffer_size']] <- bmat@buffer_size
    matrix_info[['matrix_path']] <- bmat@dir
  }

  return(matrix_info)
}


# Show the matrix information.
show_matrix_info <- function(matrix_info, indent='') {
  message('matrix_info:')
  if(!is.null(matrix_info[['matrix_class']])) {
    message(paste0(indent, 'class:       ', matrix_info[['matrix_class']]))
  }
  if(!is.null(matrix_info[['matrix_mode']])) { 
    message(paste0(indent, 'mode:        ', matrix_info[['matrix_mode']]))
  }
  if(!is.null(matrix_info[['matrix_type']])) { 
    message(paste0(indent, 'type:        ', matrix_info[['matrix_type']]))
  }
  if(!is.null(matrix_info[['matrix_path']])) { 
    message(paste0(indent, 'path:        ', matrix_info[['matrix_path']]))
  }
  if(!is.null(matrix_info[['matrix_compress']])) { 
    message(paste0(indent, 'compress:    ', matrix_info[['matrix_compress']]))
  }
  if(!is.null(matrix_info[['matrix_buffer_size']])) { 
    message(paste0(indent, 'buffer_size: ', matrix_info[['matrix_buffer_size']]))
  }
}


# Compare two matrix control lists. Return TRUE if they are the same. There is
# a minor complication. If the matrix_class in both lists is BPCells, the
# matrix storage mode is 'dir, and compare_matrix_path_flag is FALSE, the
# matrix_path values may differ and compare_matrix_control still returns TRUE.
# Otherwise, compare_matrix_control returns FALSE.
compare_matrix_control <- function(matrix_control=list(), matrix_info=list(), compare_matrix_path_flag=FALSE) {
  if(matrix_control[['matrix_class']] == 'dgCMatrix' && matrix_info[['matrix_class']] == 'dgCMatrix') {
    return(TRUE)
  }
  else
  if(matrix_control[['matrix_class']] == 'BPCells' && matrix_info[['matrix_class']] == 'BPCells') {
    if(matrix_control[['matrix_mode']] == 'mem' && matrix_info[['matrix_mode']] == 'mem') {
      if(matrix_control[['matrix_type']] == matrix_info[['matrix_type']] &&
         matrix_control[['matrix_compress']] == matrix_info[['matrix_compress']]) {
        return(TRUE)
      }
    }
    else
    if(matrix_control[['matrix_mode']] == 'dir' && matrix_info[['matrix_mode']] == 'dir') {
      if(matrix_control[['matrix_type']] == matrix_info[['matrix_type']] &&
         matrix_control[['matrix_compress']] == matrix_info[['matrix_compress']] &&
         matrix_control[['matrix_buffer_size']] == matrix_info[['matrix_buffer_size']]) {
        if(compare_matrix_path_flag == FALSE ||
            matrix_control[['matrix_path']] == matrix_info[['matrix_path']]) {
          return(TRUE)
        }
        else {
          return(FALSE)
        }
      }
    }
  }

  return(FALSE)
}


set_matrix_citation <- function(cds) {
  matrix_class <- get_matrix_class(counts(cds))
  if(matrix_class == 'BPCells') {
    cds <- add_citation(cds, 'bpcells')
  }
  cds
}


# set_matrix_class
#  Notes:
#    o  set_matrix_class is meant to be the function used
#       nearly exclusively for making BPCells matrices. The
#       exceptions are load_monocle_objects and save_monocle_objects
#       which use BPCells::write_matrix_dir() 'directly'.
#    o  Cast an input matrix into a class given or inferred from the
#       matrix_control, which must be complete in the sense that all
#       values required for the class are given explicitly.
#    o  When the matrix_info values of the input matrix are the same
#       as the matrix_control values AND the matrix_class is BPCells,
#       the queued operations are not applied (without copying the
#       on-disk matrix) when matrix_control[['matrix_bpcells_copy']] is
#       FALSE. Otherwise, the queued BPCells operations are applied
#       and the result is stored in a new on-disk directory. Applying
#       the queued BPCells operations improves performance.
#   
set_matrix_class <- function(mat, matrix_control=list()) {
  # Check matrix_control list.
  check_matrix_control(matrix_control=matrix_control, control_type='unrestricted', check_conditional=TRUE)

  # Get input matrix info.
  matrix_info <- get_matrix_info(mat=mat)

# message('set_matrix_class: matrix_info: in:')
# show_matrix_info(matrix_info, indent='  ')
# message('')
# message('set_matrix_class: matrix_control in:')
# show_matrix_control(matrix_control)
# message('')

  if(matrix_info[['matrix_class']] == 'dgTMatrix') {
    mat <- as(mat, 'CsparseMatrix')
    matrix_info[['matrix_class']] <- 'dgCMatrix'
  }

  mat_out <- mat

  # Can we return the input matrix object immediately? We
  # can when the matrix_info and matrix_control are the same
  # (except for the matrix_path) AND the matrix_class is
  # dgCMatrix OR (the matrix_class is 'BPCells' AND
  # matrix_bpcells_copy is FALSE.
  if(compare_matrix_control(matrix_control, matrix_info, compare_matrix_path_flag=FALSE)) {
    if(matrix_control[['matrix_class']] == 'dgCMatrix' ||
       (matrix_control[['matrix_class']] == 'BPCells' &&
        matrix_control[['matrix_bpcells_copy']] == FALSE)) {
      return(mat_out)
    }
  }

  if(matrix_control[['matrix_class']] == 'dgCMatrix') {
    if(matrix_info[['matrix_class']] != 'dgCMatrix') {
      if(matrix_info[['matrix_class']] == 'BPCells') {
        mat_out <- as(mat, 'dgCMatrix')
      }
      else {
        mat_out <- as(mat, 'CsparseMatrix')
      }
    }
  }
  else
  if(matrix_control[['matrix_class']] == 'BPCells') {

    matrix_class_d <- matrix_control[['matrix_class']]
    matrix_mode_d <- matrix_control[['matrix_mode']]
    matrix_type_d <- matrix_control[['matrix_type']]
    matrix_compress_d <- matrix_control[['matrix_compress']]
    matrix_path_d <- matrix_control[['matrix_path']]
    matrix_buffer_size_d <- matrix_control[['matrix_buffer_size']]

    if(matrix_info[['matrix_class']] == 'dgCMatrix') {
      if(matrix_mode_d == 'mem') {
        mat_out <- BPCells::write_matrix_memory(mat=BPCells::convert_matrix_type(mat, type=matrix_type_d),
                                                compress=matrix_compress_d)
      }
      else
      if(matrix_mode_d == 'dir') {
        mat_dir <- tempfile(pattern=paste0('monocle.bpcells.',
                                           format(Sys.Date(), format='%Y%m%d'), '.'),
                            tmpdir=matrix_path_d,
                            fileext='.tmp')[[1]]
        mat_out <- BPCells::write_matrix_dir(mat=BPCells::convert_matrix_type(mat, type=matrix_type_d),
                                             dir=mat_dir,
                                             compress=matrix_compress_d,
                                             buffer_size=matrix_buffer_size_d,
                                             overwrite=FALSE)
        push_matrix_path(mat=mat_out)
      }
      else {
        stop('set_matrix_class: unrecognized matrix_info[[\'matrix_mode\']]')
      }
    }
    else
    if(matrix_info[['matrix_class']] == 'BPCells') {
      if(matrix_mode_d == 'mem') {
        mat_out <- BPCells::write_matrix_memory(mat=BPCells::convert_matrix_type(mat, type=matrix_type_d),
                                                compress=matrix_compress_d)
      }
      else
      if(matrix_mode_d == 'dir') {
        mat_dir <- tempfile(pattern=paste0('monocle.bpcells.', 
                                           format(Sys.Date(), 
                                                  format='%Y%m%d'), '.'), 
                            tmpdir=matrix_path_d,
                            fileext='.tmp')[[1]]
        mat_out <- BPCells::write_matrix_dir(mat=BPCells::convert_matrix_type(mat, type=matrix_type_d),
                                             dir=mat_dir, 
                                             compress=matrix_compress_d, 
                                             buffer_size=matrix_buffer_size_d, 
                                             overwrite=FALSE)
        push_matrix_path(mat=mat_out)
      }
    }
  }
  else {
    stop('set_matrix_class: unrecognized matrix_info[[\'matrix_class\']]: ', matrix_info[['matrix_class']])
  }

#message('set_matrix_class: matrix_info: out:')
#show_matrix_info(get_matrix_info(mat=mat_out), indent='  ')

  return(mat_out)
}


# Remove the BPCells matrix directory and the matrix R object.
rm_bpcells_dir <- function(mat) {
  mat_info <- get_matrix_info(mat=mat)
  if(mat_info[['matrix_class']] == 'BPCells' &&
     mat_info[['matrix_mode']] == 'dir') {
    unlink(mat_info[['matrix_path']], recursive=TRUE)
    rm(mat)
  }
}


#
# Set row-major order counts matrix in cds with BPCells counts matrix.
#

#' Set the row-major order counts matrix in the assays slot of the CDS when the
#'   CDS has a BPCells counts matrix.
#'
#' @description
#' By default, BPCells stores matrices as one-dimensional vectors in column-major order, as does R.
#' As a result, column access is fast and row access is slow. We use BPCell's ability to also store and
#' access matrices in row-major order, which gives fast row access. The function set_cds_row_order_matrix
#' creates the row-major order matrix and stores it in the assays slot of the CDS with the name
#' "counts_row_order". The two copies of the counts matrix must have the same count values so if you
#' replace or change the CDS's counts matrix, you must also update the `counts_row_order` matrix, which
#' you can do using this function set_cds_row_order_matrix.
#' @param cds cell_data_set A cell_data_set.
#' @return cell_data_set The cell_data_set with the additional row-major order
#'    counts matrix.
#' @examples
#'    cds <- load_a549(matrix_control=list(matrix_class='BPCells'))
#'    cds <- set_cds_row_order_matrix(cds)
#'    str(cds)
#'
#' @export
set_cds_row_order_matrix <- function(cds) {

  mat_c <- counts(cds)
  if(!is(mat_c, 'IterableMatrix')) {
    return(cds)
  }

  matrix_info <- get_matrix_info(mat=mat_c)
  if(matrix_info[['matrix_mode']] == 'dir') {
    bmat <- bpcells_find_base_matrix(mat=mat_c)
    matrix_path <- dirname(bmat@dir)
  }
  else {
    matrix_path <- '.'
  }

  # Remove existing counts_row_order matrix and directory.
  if(!is.null(assays(cds)[['counts_row_order']])) {
    rm_bpcells_dir(mat=assays(cds)[['counts_row_order']])
    assays(cds)[['counts_row_order']] <- NULL
  }

  # Make a BPCells count matrix in row major order.
  outdir <- tempfile(pattern=paste0('monocle.bpcells.',
                                    format(Sys.Date(), format='%Y%m%d'), '.'),
                     tmpdir=matrix_path,
                     fileext='_r.tmp')[[1]]

  tmpdir <- tempfile(pattern='monocle.transpose_bpc.',
                     tmpdir=matrix_path,
                     '.tmp')[[1]]
  # Make 'normalized paths
  outdir <- normalizePath(outdir, mustWork=FALSE)
  tmpdir <- normalizePath(tmpdir, mustWork=FALSE)

  # I see no option for choosing compressed matrix and transpose_storage_order appears to
  # compress. This is not a big deal because only the indices are compressed.
  mat_r <- tryCatch(
             BPCells::transpose_storage_order(matrix=mat_c, outdir=outdir, tmpdir=tmpdir, load_bytes=4194304L, sort_bytes=1073741824L),
             error=function(c) {stop(paste0(trimws(c),
                                            '\n  error running BPCells::transpose_storage_order',
                                            '\n', dbar40,
                                            '\n', report_path_status(out_dir, dirname(tmpdir)),
                                            '\n', dbar40,
                                            '\n* error in set_cds_row_order_matrix')) })

  unlink(tmpdir, recursive=TRUE)
  push_matrix_path(mat=mat_r)

  assay(cds, 'counts_row_order') <- mat_r

  return(cds)
}


# Test (partially) that the BPCells counts() and
# counts_row_order() matrices are consistent.
# The test checks that the two matrices exist and
# that row sums are the same for the two.
check_bpcells_counts_matrix_pair <- function(cds) {
  if(!is(assays(cds)[['counts']], 'IterableMatrix')) {
    message('Error: the cds does not have a BPCells counts matrix.')
    return(FALSE)
  }

  if(!is(assays(cds)[['counts_row_order']], 'IterableMatrix')) {
    message('Error: the cds does not have a BPCells counts_row_order matrix.')
    return(FALSE)
  }

  counts_rowsums <- BPCells::rowSums(counts(cds))
  counts_row_order_rowsums <- BPCells::rowSums(assays(cds)[['counts_row_order']])

  if(length(counts_rowsums) != length(counts_row_order_rowsums)) {
    message('Error: the cds counts and counts_row_order matrix row sums have different lengths')
    return(FALSE)
  }

  if(any(counts_rowsums != counts_row_order_rowsums)) {
    message('Error: the cds counts and counts_row_order matrix row sums have different values')
    return(FALSE)
  }

  return(TRUE)
}


#' Convert the counts matrix class in the given CDS.
#'
#' @description Converts the counts matrix that is in the
#'   given CDS to the matrix_class specified in the
#'   matrix_control list.
#' @param cds cell_data_set The cell_data_set that has the
#'   counts matrix to be converted.
#' @param matrix_control list A list of matrix control
#'   values used to convert the counts matrix. If the
#'   counts matrix in the cds is the same as the desired
#'   counts matrix, it is not altered. matrix_control is
#'   required.
#' @return cell_data_set The cell_data_set with the converted
#'   counts matrix.
#' @examples
#'    cds <- load_a549()
#'    str(counts(cds))
#'    cds <- convert_counts_matrix(cds, matrix_control=list(matrix_class='BPCells'))
#'    str(counts(cds))
#'
#' @export
convert_counts_matrix <- function(cds, matrix_control=list()) {
  assertthat::assert_that(is.list(matrix_control),
                          msg = 'convert_counts_matrix: invalid matrix_control parameter')

  matrix_control_default <- tryCatch(set_matrix_control_default(matrix_control=matrix_control, control_type='unrestricted'),
                              error = function(c) { stop(paste0(trimws(c), '\n* error in convert_counts_matrix')) })

  matrix_control_res <- set_matrix_control(matrix_control=matrix_control, matrix_control_default=matrix_control_default, control_type='unrestricted')

  # Do not make a BPCells matrix on-disk copy if the
  # matrix_info values of the counts matrix are the
  # same as the matrix_control values.
  matrix_control_res[['matrix_bpcells_copy']] <- FALSE

  mat <- counts(cds)

  counts(cds, bpcells_warn=FALSE) <- set_matrix_class(mat=mat, matrix_control=matrix_control_res)
  cds <- set_matrix_citation(cds)

  if(matrix_control_res[['matrix_class']] == 'BPCells') {
    push_matrix_path(mat=mat)
    cds <- set_cds_row_order_matrix(cds=cds)
  }

  return(cds)
}
cole-trapnell-lab/monocle3 documentation built on June 11, 2025, 11:22 p.m.