R/08-powsimR.R

Defines functions powsimR_simulation powsimR_estimation

Documented in powsimR_estimation powsimR_simulation

#' Estimate Parameters From Real Datasets by powsimR
#'
#' This function is used to estimate useful parameters from a real dataset by
#' using `estimateParam` or `estimateSpike` function in powsimR package.
#'
#' @param ref_data A count matrix. Each row represents a gene and each column
#' represents a cell.
#' @param verbose Logical.
#' @param other_prior A list with names of certain parameters. Some methods need
#' extra parameters to execute the estimation step, so you must input them. In
#' simulation step, the number of cells, genes, groups, batches, the percent of
#' DEGs are usually customed, so before simulating a dataset you must point it out.
#' See `Details` below for more information.
#' @param seed An integer of a random seed.
#' @details
#' powsimR provides some choices for users to select suitable parameters according
#' to different types of data, platforms, normalization methods, distributions and
#' so on.
#' 1. RNAseq. "bulk" or "singlecell" (default).
#' 2. Protocol. Options are "UMI" (default) (e.g. 10X Genomics, CEL-seq2) or "Read" (e.g. Smart-seq2).
#' 3. Distribution. "NB" (default) for negative binomial or "ZINB" for zero-inflated negative binomial distribution fitting.
#' 4. Normalisation. "TMM" (default), "MR", "PosCounts", "UQ", "scran", "Linnorm", "SCnorm", "Census", "depth", "none".
#'
#' powsimR also provides an another choice to estimate parameters (not neccessary)
#' via spike-ins. If users want to use this, make sure that the reference data
#' must contain ERCC spike-in counts. In addtion, users must set dilution.factor and
#' volume information by `other_prior = list(dilution.factor = xxx, volume = xxx)`.
#' For more instructions, see `Examples`.
#' @return A list contains the estimated parameters and the results of execution
#' detection.
#' @export
#' @references
#' Vieth B, Ziegenhain C, Parekh S, et al. powsimR: power analysis for bulk and single cell RNA-seq experiments. Bioinformatics, 2017, 33(21): 3486-3488. <https://doi.org/10.1093/bioinformatics/btx435>
#'
#' Github URL: <https://github.com/bvieth/powsimR>
#'
#' @examples
#' \dontrun{
#' ref_data <- simmethods::data
#'
#' ## Estimate parameters without ERCC spike-in
#' estimate_result <- powsimR_estimation(
#'   ref_data = ref_data,
#'   other_prior = list(RNAseq = "singlecell",
#'                      Protocol = "UMI",
#'                      Normalisation = "scran"),
#'   verbose = TRUE,
#'   seed = 111)
#'
#' ## Estimate parameters with ERCC spike-in
#' ### Make sure there are ERCC names in reference data
#' rownames(ref_data)[grep(rownames(ref_data), pattern = "^ERCC")]
#' ### Users must input the dilution.factor and volume (microliter) to determine the ERCC molecules
#' estimate_result <- powsimR_estimation(
#'   ref_data = ref_data,
#'   other_prior = list(RNAseq = "singlecell",
#'                      Protocol = "UMI",
#'                      Normalisation = "scran",
#'                      dilution.factor = 50000,
#'                      volume = 1),
#'   verbose = TRUE,
#'   seed = 111)
#' }
#'
powsimR_estimation <- function(ref_data,
                               verbose = FALSE,
                               other_prior = NULL,
                               seed){

  ##############################################################################
  ####                            Environment                                ###
  ##############################################################################
  if(!requireNamespace("powsimR", quietly = TRUE)){
    message("powsimR is not installed on your device...")
    message("Installing powsimR...")
    devtools::install_github("bvieth/powsimR", build_vignettes = FALSE, dependencies = TRUE)
  }
  ##############################################################################
  ####                               Check                                   ###
  ##############################################################################
  if(!is.matrix(ref_data)){
    ref_data <- as.matrix(ref_data)
  }
  # Add ref_data to other_prior
  other_prior[["countData"]] <- ref_data
  other_prior[["verbose"]] <- verbose
  # Check
  if(is.null(other_prior[["RNAseq"]])){
    other_prior[["RNAseq"]] <- "singlecell"
  }
  if(is.null(other_prior[["Protocol"]])){
    other_prior[["Protocol"]] <- "UMI"
  }
  # Sub function
  if(is.null(other_prior[["Distribution"]])){
    other_prior[["Distribution"]] <- "NB"
  }
  if(is.null(other_prior[["Normalisation"]])){
    other_prior[["Normalisation"]] <- "TMM"
  }

  ## Spike-in
  if(!is.null(other_prior[["volume"]]) & !is.null(other_prior[["dilution.factor"]])){
    spikeData <- ref_data[grep(rownames(ref_data), pattern = "^ERCC"), ]
    concentration <- simmethods::ERCC_info$con_Mix1_attomoles_ul
    spikeInfo <- data.frame(SpikeID = simmethods::ERCC_info$ERCC_id,
                            SpikeInput = concentration*10^-18*6.022*10^23*other_prior[["volume"]]/other_prior[["dilution.factor"]],
                            row.names = simmethods::ERCC_info$ERCC_id)
    spikeInfo <- spikeInfo[rownames(spikeData), ]
    other_prior[["spikeData"]] <- spikeData
    other_prior[["spikeInfo"]] <- spikeInfo
    spike_in_formals <- simutils::change_parameters(function_expr = "powsimR::estimateSpike",
                                                    other_prior = other_prior,
                                                    step = "estimation")
    spike_in_formals$Normalisation <- "depth"
  }else spike_in_formals <- NULL
  estimate_formals <- simutils::change_parameters(function_expr = "powsimR::estimateParam",
                                                  other_prior = other_prior,
                                                  step = "estimation")

  ##############################################################################
  ####                            Estimation                                 ###
  ##############################################################################
  if(verbose){
    message("Estimating parameters using powsimR")
  }
  # Seed
  set.seed(seed)
  # Estimation
  # Estimate parameters from real data and return parameters and detection results
  cat("Estimating parameters using estimateParam function\n")
  estimate_detection <- peakRAM::peakRAM(
    estimate_result <- do.call(powsimR::estimateParam, estimate_formals)
  )
  if(!is.null(spike_in_formals)){
    estSpikeRes <- do.call(powsimR::estimateSpike, spike_in_formals)
  }else estSpikeRes <- NULL
  estimate_result <- list(estimate_result = estimate_result,
                          estSpikeRes = estSpikeRes)
  ##############################################################################
  ####                           Ouput                                       ###
  ##############################################################################
  estimate_output <- list(estimate_result = estimate_result,
                          estimate_detection = estimate_detection)
  return(estimate_output)
}


#' Simulate Datasets by powsimR
#'
#' This function is used to simulate datasets from learned parameters by `simulateDE`
#' function in powsimR package.
#'
#' @param parameters A object generated by [powsimR::estimateParam()] or [powsimR::estimateSpike()].
#' @param other_prior A list with names of certain parameters. Some methods need
#' extra parameters to execute the estimation step, so you must input them. In
#' simulation step, the number of cells, genes, groups, batches, the percent of
#' DEGs are usually customed, so before simulating a dataset you must point it out.
#' See `Details` below for more information.
#' @param return_format A character. Alternative choices: list, SingleCellExperiment,
#' Seurat, h5ad. If you select `h5ad`, you will get a path where the .h5ad file saves to.
#' @param verbose Logical. Whether to return messages or not.
#' @param seed A random seed.
#' @importFrom simutils proportionate
#' @importFrom BiocGenerics cbind
#' @export
#' @details
#' powsimR provides some choices for users to select suitable parameters according
#' to different normalization methods, and methods for differential expressed analysis.
#' 1. Normalisation. "TMM" (default), "MR", "PosCounts", "UQ", "scran", "Linnorm", "sctransform", "SCnorm", "Census", "depth".
#' 2. DEmethod. "T-Test", "edgeR-LRT", "edgeR-QL", "edgeR-zingeR", "edgeR-ZINB-WaVE", "limma-voom", "limma-trend" (default), "DESeq2", "DESeq2-zingeR", "DESeq2-ZINB-WaVE", "ROTS", "baySeq", "NOISeq", "EBSeq", "MAST", "BPSC", "scDD", "DECENT".
#'
#' In addtion to simulate datasets with default parameters, users want to simulate
#' other kinds of datasets, e.g. a counts matrix with 2 or more cell groups. In
#' powsimR, you can set extra parameters to simulate datasets.
#'
#' The customed parameters you can set are below:
#' 1. nCells. In powsimR, You can directly set `other_prior = list(nCells = 5000)` to simulate 5000 cells.
#' 2. nGenes. You can directly set `other_prior = list(nGenes = 5000)` to simulate 5000 genes.
#' 3. de.prob. You can directly set `other_prior = list(de.prob = 0.2)` to simulate DEGs that account for 20 percent of all genes.
#' 4. prob.group. You can directly set `other_prior = list(prob.group = c(0.4, 0.6))` to assign two proportions of cell groups. Note that the the length of the vector must be **2**.
#' 5. fc.group. You can directly set `other_prior = list(fc.group = 2)` to specify the fold change of DEGs.
#' 6. nBatches. You can not directly set `other_prior = list(nBatches = 3)` to simulate 3 batches. Instead, you should set `other_prior = list(prob.batch = c(0.3, 0.4, 0.3))` to reach the goal.
#' 7. fc.batch. You can directly set `other_prior = list(fc.batch = 2)` to specify the fold change of genes between batches.
#'
#' For more customed parameters in powsimR, please check [powsimR::simulateDE()].
#' @references
#' Vieth B, Ziegenhain C, Parekh S, et al. powsimR: power analysis for bulk and single cell RNA-seq experiments. Bioinformatics, 2017, 33(21): 3486-3488. <https://doi.org/10.1093/bioinformatics/btx435>
#'
#' Github URL: <https://github.com/bvieth/powsimR>
#'
#' @examples
#' \dontrun{
#' ref_data <- simmethods::data
#'
#' ## Estimate parameters with ERCC spike-in
#' ### Make sure there are ERCC names in reference data
#' rownames(ref_data)[grep(rownames(ref_data), pattern = "^ERCC")]
#' ### Users must input the dilution.factor and volume (nanoliter) to determine the ERCC molecules
#' estimate_result <- powsimR_estimation(
#'   ref_data = ref_data,
#'   other_prior = list(RNAseq = "singlecell",
#'                      Protocol = "UMI",
#'                      Normalisation = "scran",
#'                      dilution.factor = 50000,
#'                      volume = 1),
#'   verbose = TRUE,
#'   seed = 111)
#'
#' # (1) Simulate 500 cells and 2000 genes
#' simulate_result <- simmethods::powsimR_simulation(
#'   parameters = estimate_result[["estimate_result"]],
#'   other_prior = list(nCells = 500,
#'                      nGenes = 2000),
#'   return_format = "list",
#'   verbose = TRUE,
#'   seed = 111)
#' count_data <- simulate_result[["simulate_result"]][["count_data"]]
#' dim(count_data)
#' ## In powsimR, users always get two groups of cells, and the numbers of cells in
#' ## different groups are the same (default, prob.group = c(0.5, 0.5)).
#' col_data <- simulate_result[["simulate_result"]][["col_meta"]]
#' table(col_data$group)
#' ## row_data
#' row_data <- simulate_result[["simulate_result"]][["row_meta"]]
#' table(row_data$de_gene)[2]/2000
#'
#' # (2) Simulate two groups with 20% of DEGs and 4 fold change.
#' simulate_result <- simmethods::powsimR_simulation(
#'   parameters = estimate_result[["estimate_result"]],
#'   other_prior = list(prob.group = c(0.3, 0.7),
#'                      de.prob = 0.2,
#'                      fc.group = 4),
#'   return_format = "list",
#'   verbose = TRUE,
#'   seed = 111)
#' count_data <- simulate_result[["simulate_result"]][["count_data"]]
#' dim(count_data)
#' ## cell information
#' col_data <- simulate_result[["simulate_result"]][["col_meta"]]
#' table(col_data$group)
#' ## gene information
#' row_data <- simulate_result[["simulate_result"]][["row_meta"]]
#' table(row_data$de_gene)[2]/4000
#'
#'
#' # (3) Simulate two batches (2 fold change)
#' simulate_result <- simmethods::powsimR_simulation(
#'   parameters = estimate_result[["estimate_result"]],
#'   other_prior = list(prob.batch = c(0.4, 0.6),
#'                      fc.batch = 2),
#'   return_format = "list",
#'   verbose = TRUE,
#'   seed = 111)
#' count_data <- simulate_result[["simulate_result"]][["count_data"]]
#' dim(count_data)
#' ## cell information
#' col_data <- simulate_result[["simulate_result"]][["col_meta"]]
#' table(col_data$group)
#' table(col_data$batch)
#' ## gene information
#' row_data <- simulate_result[["simulate_result"]][["row_meta"]]
#' head(row_data)
#' table(row_data$de_fc)
#' table(row_data$batch_genes)
#' }
#'
powsimR_simulation <- function(parameters,
                               other_prior = NULL,
                               return_format,
                               verbose = FALSE,
                               seed
){

  ##############################################################################
  ####                            Environment                                ###
  ##############################################################################
  if(!requireNamespace("powsimR", quietly = TRUE)){
    message("powsimR is not installed on your device...")
    message("Installing powsimR...")
    devtools::install_github("bvieth/powsimR", build_vignettes = FALSE, dependencies = TRUE)
  }
  ##############################################################################
  ####                               Check                                   ###
  ##############################################################################
  other_prior[["estParamRes"]] <- parameters$estimate_result
  other_prior[["setup.seed"]] <- seed
  other_prior[["verbose"]] <- verbose
  if(!is.null(parameters[["estSpikeRes"]])){
    other_prior[["estSpikeRes"]] <- parameters$estSpikeRes
  }
  ##############################################################################
  ####                                Setup                                  ###
  ##############################################################################
  ## Sim num
  other_prior[["nsims"]] <- 1
  ## genes
  if(is.null(other_prior[["nGenes"]])){
    other_prior[["ngenes"]] <- parameters$estimate_result[["totalG"]]
  }else{
    other_prior[["ngenes"]] <- other_prior[["nGenes"]]
  }
  ## cells
  ### prob.batch
  if(!is.null(other_prior[["prob.batch"]])){
    prob.batch <- other_prior[["prob.batch"]]
  }else{
    prob.batch <- 1
  }
  ### prob.group
  if(!is.null(other_prior[["prob.group"]])){
    prob.group <- other_prior[["prob.group"]]
    if(sum(prob.group) != 1){
      stop("The sum of prob.group is not euqal to 1")
    }
    if(length(prob.group) != 2){
      stop("The length of prob.group is not euqal to 2")
    }
  }else{
    prob.group <- c(0.5, 0.5)
  }
  ### Assign cells
  if(is.null(other_prior[["nCells"]])){
    nCells <- parameters$estimate_result[["totalS"]]
  }else{
    nCells <- other_prior[["nCells"]]
  }

  group1_cell <- round(nCells * prob.group[1])
  group2_cell <- nCells - group1_cell

  if(length(prob.batch) == 1){
    nbatches <- 1
    other_prior[["n1"]] <- group1_cell
    other_prior[["n2"]] <- group2_cell
  }else{
    nbatches <- length(prob.batch)
    other_prior[["n1"]] <- simutils::proportionate(number = group1_cell,
                                                   result_sum_strict = group1_cell,
                                                   prop = prob.batch,
                                                   prop_sum_strict = 1,
                                                   digits = 0)
    other_prior[["n2"]] <- simutils::proportionate(number = group2_cell,
                                                   result_sum_strict = group2_cell,
                                                   prop = prob.batch,
                                                   prop_sum_strict = 1,
                                                   digits = 0)
  }

  ## proportion of genes in groups
  if(is.null(other_prior[["de.prob"]])){
    other_prior[["p.DE"]] <- 0.1
  }else{
    other_prior[["p.DE"]] <- other_prior[["de.prob"]]
  }
  ## fold change of genes in groups
  if(is.null(other_prior[["fc.group"]])){
    other_prior[["pLFC"]] <- 1
  }else{
    other_prior[["pLFC"]] <- log2(other_prior[["fc.group"]])
  }
  ## fold change of genes in batches
  if(is.null(other_prior[["fc.batch"]])){
    if(nbatches == 1){
      other_prior[["bLFC"]] <- NULL
    }else{
      other_prior[["bLFC"]] <- 1
    }
  }else{
    other_prior[["bLFC"]] <- log2(other_prior[["fc.batch"]])
  }
  ## proportion of genes in batches
  if(nbatches == 1){
    other_prior[["p.B"]] <- NULL
  }else{
    if(is.null(other_prior[["p.B"]])){
      other_prior[["p.B"]] <- 0.1
    }
  }
  ## return to users
  message(paste0("nCells: ", nCells))
  message(paste0("nGenes: ", other_prior[['ngenes']]))
  message("nGroups: 2")
  message(paste0("de.prob: ", other_prior[['p.DE']]))
  message(paste0("fc.group: ", 2^other_prior[['pLFC']]))
  message(paste0("nBatches: ", nbatches))
  message(paste0("fc.batch: ", 2^other_prior[['bLFC']]))

  setup_formals <- simutils::change_parameters(function_expr = "powsimR::Setup",
                                               other_prior = other_prior,
                                               step = "simulation")

  SetupRes <- BiocGenerics::do.call(powsimR::Setup, setup_formals)
  ##############################################################################
  ####                            Simulation                                 ###
  ##############################################################################
  other_prior[["SetupRes"]] <- SetupRes
  other_prior[["Counts"]] <- TRUE

  if(is.null(other_prior[["Normalisation"]])){
    other_prior[["Normalisation"]] <- "TMM"
  }
  if(is.null(other_prior[['DEmethod']])){
    other_prior[['DEmethod']] <- "limma-trend"
  }
  simulate_formals <- simutils::change_parameters(function_expr = "powsimR::simulateDE",
                                                  other_prior = other_prior,
                                                  step = "simulation")
  if(verbose){
    message("Simulating datasets using powsimR")
  }
  # Simulation
  simulate_detection <- peakRAM::peakRAM(
    simulate_result <- BiocGenerics::do.call(powsimR::simulateDE, simulate_formals)
  )
  ##############################################################################
  ####                        Format Conversion                              ###
  ##############################################################################
  # Extract count data
  if(length(simulate_result[["Counts"]]) != 1){
    counts <- c()
    for(name in names(simulate_result[["Counts"]])){
      counts <- BiocGenerics::cbind(counts, simulate_result[["Counts"]][[name]][[1]])
    }
  }else{
    counts <- simulate_result[["Counts"]][[1]][[1]]
  }
  # Save the old name
  cell_tmp_name <- colnames(counts)
  gene_tmp_name <- rownames(counts)
  # Rename
  colnames(counts) <- paste0("Cell", c(1:ncol(counts)))
  rownames(counts) <- paste0("Gene", c(1:nrow(counts)))
  # Extract infomation
  if(!is.null(SetupRes[["DESetup"]][["bLFC"]])){
    row_data <- data.frame("gene_name" = rownames(counts),
                           "de_gene" = ifelse(simulate_result[["DESetup"]][["pLFC"]][[1]] == 0, "no", "yes"),
                           "de_fc" = ifelse(simulate_result[["DESetup"]][["pLFC"]][[1]] == 0, 0,
                                            2^simulate_result[["DESetup"]][["pLFC"]][[1]]),
                           "batch_genes" = ifelse(simulate_result[["DESetup"]][["bLFC"]][[1]] == 0, "no", "yes"),
                           "batch_fc" = ifelse(simulate_result[["DESetup"]][["bLFC"]][[1]] ==0, 0,
                                               2^simulate_result[["DESetup"]][["bLFC"]][[1]]))
  }else{
    row_data <- data.frame("gene_name" = rownames(counts),
                           "de_gene" = ifelse(simulate_result[["DESetup"]][["pLFC"]][[1]] == 0, "no", "yes"),
                           "de_fc" = ifelse(simulate_result[["DESetup"]][["pLFC"]][[1]] == 0, 0,
                                            2^simulate_result[["DESetup"]][["pLFC"]][[1]]))
  }
  if(length(simulate_result[["Counts"]]) != 1){
    col_data <- data.frame("cell_name" = colnames(counts),
                           "group" = c(rep("Group1", other_prior[["n1"]][1]),
                                       rep("Group2", other_prior[["n2"]][1]),
                                       rep("Group1", other_prior[["n1"]][2]),
                                       rep("Group2", other_prior[["n2"]][2])),
                           "batch" = rep(paste0("Batch", 1:nbatches), (other_prior[["n1"]] + other_prior[["n2"]])))
  }else{
    col_data <- data.frame("cell_name" = colnames(counts),
                           "group" = paste0("Group",
                                            stringr::str_split(cell_tmp_name, pattern = "_n", simplify = T)[, 2]))
  }
  # SingleCellExperiment object
  simulate_result <- SingleCellExperiment::SingleCellExperiment(list(counts = counts),
                                                                colData = col_data,
                                                                rowData = row_data)
  simulate_result <- simutils::data_conversion(SCE_object = simulate_result,
                                               return_format = return_format)

  ##############################################################################
  ####                           Ouput                                       ###
  ##############################################################################
  simulate_output <- list(simulate_result = simulate_result,
                          simulate_detection = simulate_detection)
  return(simulate_output)
}
duohongrui/simmethods documentation built on June 17, 2024, 10:49 a.m.