R/22-SparseDC.R

Defines functions SparseDC_simulation SparseDC_estimation

Documented in SparseDC_estimation SparseDC_simulation

#' Estimate Parameters From Real Datasets by SparseDC
#'
#' This function is used to estimate useful parameters from a real dataset by
#' using `sparseDCEstimate` function in Splatter package.
#'
#' @param ref_data A count matrix. Each row represents a gene and each column
#' represents a cell.
#' @param verbose Logical.
#' @param seed An integer of a random seed.
#' @param other_prior A list with names of certain parameters. Some methods need
#' extra parameters to execute the estimation step, so you must input them. In
#' the simulation step, the number of cells, genes, groups, batches and the
#' percentage of DEGs are usually customized, so you must specify them before
#' simulating a dataset. See `Details` below for more information.
#' @importFrom splatter sparseDCEstimate newSparseDCParams
#' @return A list containing the estimated parameters (`estimate_result`) and
#' the results of execution detection (`estimate_detection`).
#' @export
#' @details
#' In SparseDC, users must input cell group information to estimate parameters from
#' real data. SparseDC may fail to estimate parameters because of an unsuitable
#' distribution fit; in that case, users can set the `nclusters` parameter and try again.
#'
#' For more information, see `Examples` and [splatter::sparseDCEstimate()]
#'
#' @references
#' Barron M, Zhang S, Li J. A sparse differential clustering algorithm for tracing cell type changes via single-cell RNA-sequencing data. Nucleic acids research, 2018, 46(3): e14-e14. <https://doi.org/10.1093/nar/gkx1113>
#'
#' CRAN URL: <https://cran.rstudio.com/web/packages/SparseDC/index.html>
#'
#' @examples
#' \dontrun{
#' ref_data <- SingleCellExperiment::counts(scater::mockSCE())
#' ## cell groups
#' set.seed(111)
#' group_condition <- sample(1:2, ncol(ref_data), replace = TRUE)
#' ## estimation
#' estimate_result <- simmethods::SparseDC_estimation(
#'   ref_data = ref_data,
#'   other_prior = list(group.condition = group_condition),
#'   verbose = TRUE,
#'   seed = 111
#' )
#' ## Note that SparseDC assumes 2 clusters are present in the dataset by default.
#' ## Users can input another number if the estimation step fails.
#' estimate_result <- simmethods::SparseDC_estimation(
#'   ref_data = ref_data,
#'   other_prior = list(group.condition = group_condition,
#'                      nclusters = 3),
#'   verbose = TRUE,
#'   seed = 111
#' )
#' }
#'
SparseDC_estimation <- function(ref_data,
                                verbose = FALSE,
                                other_prior,
                                seed
){
  # Estimate SparseDC parameters from a reference count matrix.
  # Returns a list holding the estimated parameter object (`estimate_result`)
  # and the profiling record produced by peakRAM (`estimate_detection`).

  # ---- Input preparation -----------------------------------------------------
  # The reference data is stored as `counts`; anything that is not already a
  # matrix is coerced with as.matrix() first.
  other_prior[["counts"]] <- if (is.matrix(ref_data)) ref_data else as.matrix(ref_data)

  # The per-cell condition labels are mandatory and are passed on under the
  # argument name `conditions`.
  condition_labels <- other_prior[["group.condition"]]
  if (is.null(condition_labels)) {
    stop("Please input the conditions that each cell belongs to")
  }
  other_prior[["conditions"]] <- condition_labels

  # Fall back to 2 clusters when the caller did not choose a number.
  if (is.null(other_prior[["nclusters"]])) {
    other_prior[["nclusters"]] <- 2
  }

  # Start from a fresh SparseDC parameter object.
  other_prior[["params"]] <- splatter::newSparseDCParams()

  # Resolve the final argument list for splatter::sparseDCEstimate.
  estimate_formals <- simutils::change_parameters(
    function_expr = "splatter::sparseDCEstimate",
    other_prior = other_prior,
    step = "estimation"
  )

  # ---- Estimation ------------------------------------------------------------
  if (verbose) message("Estimating parameters using SparseDC")

  set.seed(seed)

  # NOTE: the expression handed to peakRAM is kept token-for-token as it was
  # (including the `estimate_result` assignment), since the profiler may record
  # the call text in its output -- presumably; verify against peakRAM docs.
  estimate_detection <- peakRAM::peakRAM(
    estimate_result <- splatter::sparseDCEstimate(
      counts = estimate_formals[["counts"]],
      conditions = estimate_formals[["conditions"]],
      nclusters = estimate_formals[["nclusters"]],
      norm = estimate_formals[["norm"]],
      params = estimate_formals[["params"]]
    )
  )

  # ---- Output ----------------------------------------------------------------
  list(estimate_result = estimate_result,
       estimate_detection = estimate_detection)
}



#' Simulate Datasets by SparseDC
#'
#' This function is used to simulate datasets from learned parameters by `sparseDCSimulate`
#' function in Splatter package.
#'
#' @param parameters An object generated by [splatter::sparseDCEstimate()]
#' @param other_prior A list with names of certain parameters. Some methods need
#' extra parameters to execute the estimation step, so you must input them. In
#' the simulation step, the number of cells, genes, groups, batches and the
#' percentage of DEGs are usually customized, so you must specify them before
#' simulating a dataset. See `Details` below for more information.
#' @param return_format A character. Alternative choices: list, SingleCellExperiment,
#' Seurat, h5ad. If you select `h5ad`, you will get a path where the .h5ad file saves to.
#' @param verbose Logical. Whether to return messages or not.
#' @param seed A random seed.
#' @importFrom splatter getParams setParam sparseDCSimulate
#' @importFrom stringr str_replace
#' @export
#' @details
#' In SparseDC, users can only set `nCells` and `nGenes` to specify the number of
#' cells and genes. But note that the total cell number is equal to `nCells` multiplied
#' by the `nclusters` that users defined in the estimation step (`nclusters` is 2 by default).
#'
#' For less commonly used parameters and instructions, see `Examples` and [splatter::SparseDCParams()]
#'
#' @references
#' Barron M, Zhang S, Li J. A sparse differential clustering algorithm for tracing cell type changes via single-cell RNA-sequencing data. Nucleic acids research, 2018, 46(3): e14-e14. <https://doi.org/10.1093/nar/gkx1113>
#'
#' CRAN URL: <https://cran.rstudio.com/web/packages/SparseDC/index.html>
#'
#' @examples
#' \dontrun{
#' ref_data <- SingleCellExperiment::counts(scater::mockSCE())
#' ## cell groups
#' set.seed(111)
#' group_condition <- sample(1:2, ncol(ref_data), replace = TRUE)
#' ## estimation
#' estimate_result <- simmethods::SparseDC_estimation(
#'   ref_data = ref_data,
#'   other_prior = list(group.condition = group_condition),
#'   verbose = TRUE,
#'   seed = 111
#' )
#' ## Note that SparseDC assumes 2 clusters are present in the dataset by default.
#' ## Users can input another number if the estimation step fails.
#' estimate_result <- simmethods::SparseDC_estimation(
#'   ref_data = ref_data,
#'   other_prior = list(group.condition = group_condition,
#'                      nclusters = 3),
#'   verbose = TRUE,
#'   seed = 111
#' )
#'
#' # 1) Simulate with default parameters
#' simulate_result <- simmethods::SparseDC_simulation(
#'   parameters = estimate_result[["estimate_result"]],
#'   other_prior = NULL,
#'   return_format = "list",
#'   verbose = TRUE,
#'   seed = 111
#' )
#' ## counts
#' counts <- simulate_result[["simulate_result"]][["count_data"]]
#' dim(counts)
#'
#' # 2) Simulate 1000 cells and 2000 genes
#' ## Note that SparseDC defines 2 clusters present in the dataset by default, so we
#' ## only need to set nCells = 500.
#' length(estimate_result[["estimate_result"]]@clusts.c1)
#' simulate_result <- simmethods::SparseDC_simulation(
#'   parameters = estimate_result[["estimate_result"]],
#'   other_prior = list(nCells = 500,
#'                      nGenes = 2000),
#'   return_format = "list",
#'   verbose = TRUE,
#'   seed = 111
#' )
#'
#' ## counts
#' counts <- simulate_result[["simulate_result"]][["count_data"]]
#' dim(counts)
#' }
#'
SparseDC_simulation <- function(parameters,
                                other_prior = NULL,
                                return_format,
                                verbose = FALSE,
                                seed
){
  # Simulate a dataset from a SparseDCParams object and convert the result to
  # the requested `return_format`. Returns a list holding the converted
  # simulated data (`simulate_result`) and the profiling record produced by
  # peakRAM (`simulate_detection`).
  ##############################################################################
  ####                               Check                                   ###
  ##############################################################################
  # inherits() always yields a single TRUE/FALSE and also accepts objects of a
  # class extending SparseDCParams, unlike comparing `class(parameters)` with
  # `==`, which rejects subclasses and is not scalar for multi-class objects.
  assertthat::assert_that(inherits(parameters, "SparseDCParams"))
  # Apply every user-supplied setting to the parameter object.
  if(!is.null(other_prior)){
    parameters <- simutils::set_parameters(parameters = parameters,
                                           other_prior = other_prior,
                                           method = "SparseDC")
  }
  ## nCells (indexing a NULL `other_prior` yields NULL, so this is skipped then)
  if(!is.null(other_prior[["nCells"]])){
    parameters <- splatter::setParam(parameters, "nCells", other_prior[["nCells"]])
  }
  ## nGenes
  if(!is.null(other_prior[["nGenes"]])){
    parameters <- splatter::setParam(parameters, "nGenes", other_prior[["nGenes"]])
  }
  # Get params to check
  params_check <- splatter::getParams(parameters, c("nCells",
                                                    "nGenes"))
  # Return to users: the cell count is reported as `nCells` multiplied by the
  # number of entries in the `clusts.c1` slot.
  message(paste0("nCells: ", params_check[['nCells']] * length(parameters@clusts.c1)))
  message(paste0("nGenes: ", params_check[['nGenes']]))
  ##############################################################################
  ####                            Simulation                                 ###
  ##############################################################################
  if(verbose){
    message("Simulating datasets using SparseDC\n")
  }
  # Seed: stored in the parameter object rather than set via set.seed()
  parameters <- splatter::setParam(parameters, name = "seed", value = seed)
  # Simulation, wrapped in peakRAM so its profiling record can be returned
  simulate_detection <- peakRAM::peakRAM(
    simulate_result <- splatter::sparseDCSimulate(parameters, verbose = verbose)
  )
  ##############################################################################
  ####                        Format Conversion                              ###
  ##############################################################################
  # counts
  counts <- as.matrix(SingleCellExperiment::counts(simulate_result))
  # col_data: relabel "Condition..." values as "Group..."
  col_info <- SingleCellExperiment::colData(simulate_result)
  col_data <- data.frame("cell_name" = colnames(counts),
                         "group" = stringr::str_replace_all(as.character(col_info$Condition),
                                                            "Condition", "Group"))
  rownames(col_data) <- col_data$cell_name
  # row_data
  row_data <- data.frame("gene_name" = rownames(counts))
  rownames(row_data) <- row_data$gene_name
  # Establish SingleCellExperiment from the counts and the rebuilt metadata
  simulate_result <- SingleCellExperiment::SingleCellExperiment(list(counts = counts),
                                                                colData = col_data,
                                                                rowData = row_data)
  simulate_result <- simutils::data_conversion(SCE_object = simulate_result,
                                               return_format = return_format)

  ##############################################################################
  ####                           Output                                      ###
  ##############################################################################
  simulate_output <- list(simulate_result = simulate_result,
                          simulate_detection = simulate_detection)
  return(simulate_output)
}
duohongrui/simmethods documentation built on June 17, 2024, 10:49 a.m.