#' Estimate Parameters From Real Datasets by SCRIP
#'
#' This function is used to estimate useful parameters from a real dataset by
#' using `splatEstimate` function in Splatter package.
#'
#' @param ref_data A count matrix. Each row represents a gene and each column
#' represents a cell.
#' @param verbose Logical.
#' @param seed An integer of a random seed.
#' @importFrom peakRAM peakRAM
#' @importFrom splatter splatEstimate
#'
#' @return A list contains the estimated parameters and the results of execution
#' detection.
#' @export
#' @references
#' Qin F, Luo X, Xiao F, et al. SCRIP: an accurate simulator for single-cell RNA sequencing data. Bioinformatics, 2022, 38(5): 1304-1311. <https://doi.org/10.1093/bioinformatics/btab824>
#'
#' CRAN URL: <https://cran.r-project.org/web/packages/SCRIP/index.html>
#'
#' Github URL: <https://github.com/thecailab/SCRIP>
#' @examples
#' \dontrun{
#' ref_data <- simmethods::data
#' estimate_result <- simmethods::SCRIP_estimation(ref_data = ref_data,
#' verbose = TRUE,
#' seed = 10)
#' estimate_result <- estimate_result[["estimate_result"]]
#' ## Check the class (SCRIP accepts the estimation procedure in Splat)
#' class(estimate_result) == "SplatParams"
#' }
#'
SCRIP_estimation <- function(ref_data,
verbose = FALSE,
seed
){
##############################################################################
#### Check ###
##############################################################################
if(!is.matrix(ref_data)){
ref_data <- as.matrix(ref_data)
}
##############################################################################
#### Estimation ###
##############################################################################
if(verbose){
message("Estimating parameters using SCRIP")
}
# Seed
set.seed(seed)
# Estimation
estimate_detection <- peakRAM::peakRAM(
estimate_result <- splatter::splatEstimate(ref_data)
)
##############################################################################
#### Ouput ###
##############################################################################
estimate_output <- list(estimate_result = estimate_result,
estimate_detection = estimate_detection)
return(estimate_output)
}
#' Simulate Datasets by SCRIP
#'
#' This function is used to simulate datasets from learned parameters by `SCRIPsimu`
#' function in SCRIP package.
#'
#' @param parameters A object generated by [splatter::splatEstimate()]
#' @param ref_data A matrix for one dataset or a list of datasets with their own
#' names. This is usually unused except for some methods, e.g. SCRIP, scDesign,
#' zingeR.
#' @param other_prior A list with names of certain parameters. Some methods need
#' extra parameters to execute the estimation step, so you must input them. In
#' simulation step, the number of cells, genes, groups, batches, the percent of
#' DEGs are usually customed, so before simulating a dataset you must point it out.
#' See `Details` below for more information.
#' @param return_format A character. Alternative choices: list, SingleCellExperiment,
#' Seurat, h5ad. If you select `h5ad`, you will get a path where the .h5ad file saves to.
#' @param verbose Logical. Whether to return messages or not.
#' @param seed A random seed.
#' @importFrom devtools install_github
#' @export
#' @details
#' SCRIP contains five different simulation modes, and you can specify which mode
#' do you use (default is GP-trendedBCV):
#' 1. GP-trendedBCV
#' 2. GP-commonBCV
#' 3. BGP-commonBCV
#' 4. BP
#' 5. BGP-trendedBCV
#'
#' In addtion to simulate datasets with default parameters, users want to simulate
#' other kinds of datasets, e.g. a counts matrix with 2 or more cell groups. In
#' SCRIP, you can set extra parameters to simulate datasets.
#'
#' The customed parameters you can set are below:
#' 1. nCells. In SCRIP, you can not set nCells directly and should set batchCells instead. For example, if you want to simulate 1000 cells, you can type `other_prior = list(batchCells = 1000)`. If you type `other_prior = list(batchCells = c(500, 500))`, the simulated data will have two batches.
#' 2. nGenes. You can directly set `other_prior = list(nGenes = 5000)` to simulate 5000 genes.
#' 3. nGroups. You can not directly set `other_prior = list(nGroups = 3)` to simulate 3 groups. Instead, you should set `other_prior = list(prob.group = c(0.2, 0.3, 0.5))` where the sum of group probabilities must equal to 1.
#' 4. de.prob. You can directly set `other_prior = list(de.prob = 0.2)` to simulate DEGs that account for 20 percent of all genes.
#' 5. prob.group. You can directly set `other_prior = list(prob.group = c(0.2, 0.3, 0.5))` to assign three proportions of cell groups. Note that the number of groups always equals to the length of the vector.
#' 6. nBatches. You can not directly set `other_prior = list(nBatches = 3)` to simulate 3 batches. Instead, you should set `other_prior = list(batchCells = c(500, 500, 500))` to reach the goal and the total cells are 1500.
#'
#' For more customed parameters in SCRIP, please check [SCRIP::SCRIPsimu()].
#' @references
#' Qin F, Luo X, Xiao F, et al. SCRIP: an accurate simulator for single-cell RNA sequencing data. Bioinformatics, 2022, 38(5): 1304-1311. <https://doi.org/10.1093/bioinformatics/btab824>
#'
#' CRAN URL: <https://cran.r-project.org/web/packages/SCRIP/index.html>
#'
#' Github URL: <https://github.com/thecailab/SCRIP>
#' @examples
#' \dontrun{
#' # Load data
#' ref_data <- simmethods::data
#' # Estimate parameters
#' estimate_result <- simmethods::SCRIP_estimation(ref_data = data,
#' verbose = TRUE,
#' seed = 10)
#'
#' # (1) Simulate 500 cells (Since we can not set nCells directly, so we can set
#' # batchCells (a numeric vector)) and 2000 genes
#' simulate_result <- simmethods::SCRIP_simulation(parameters = estimate_result[["estimate_result"]],
#' ref_data = ref_data,
#' other_prior = list(batchCells = 500,
#' nGenes = 2000),
#' return_format = "list",
#' verbose = TRUE,
#' seed = 111)
#' count_data <- simulate_result[["simulate_result"]][["count_data"]]
#' dim(count_data)
#'
#'
#' # (2) Simulate by another mode
#' simulate_result <- simmethods::SCRIP_simulation(parameters = estimate_result[["estimate_result"]],
#' ref_data = ref_data,
#' other_prior = list(mode = "BP"),
#' return_format = "list",
#' verbose = TRUE,
#' seed = 111)
#' count_data <- simulate_result[["simulate_result"]][["count_data"]]
#' dim(count_data)
#'
#'
#' # (3) Simulate two groups (de.prob = 0.1) and one batch
#' simulate_result <- simmethods::SCRIP_simulation(parameters = estimate_result[["estimate_result"]],
#' ref_data = ref_data,
#' other_prior = list(prob.group = c(0.4, 0.6)),
#' return_format = "list",
#' verbose = TRUE,
#' seed = 111)
#' count_data <- simulate_result[["simulate_result"]][["count_data"]]
#' dim(count_data)
#' ## cell information
#' col_data <- simulate_result[["simulate_result"]][["col_meta"]]
#' table(col_data$group)
#' ## gene information
#' row_data <- simulate_result[["simulate_result"]][["row_meta"]]
#' ### The result of SCRIP contains the factors of different groups and uses can
#' ### calculate the fold change by division. For example, the DEFactors of A gene
#' ### in Group1 and Group2 are respectively 2 and 1, and the fold change of A gene
#' ### is 2/1=2 or 1/2=0.5.
#' fc_group1_to_group2 <- row_data$DEFacGroup2/row_data$DEFacGroup1
#' table(fc_group1_to_group2 != 1)[2]/4000 ## de.prob = 0.1
#' ### number of all DEGs
#' table(row_data$de_gene)[2]/4000
#'
#'
#' # (4) Simulate two groups (de.prob = 0.2) and two batches
#' ## Since we can not set nBatches directly, so we can set batchCells (a numeric vector)
#' ## to determin the number of batches and cells simutaniously.
#' simulate_result <- simmethods::SCRIP_simulation(parameters = estimate_result[["estimate_result"]],
#' ref_data = ref_data,
#' other_prior = list(prob.group = c(0.4, 0.6),
#' batchCells = c(80, 80),
#' de.prob = 0.2),
#' return_format = "list",
#' verbose = TRUE,
#' seed = 111)
#' count_data <- simulate_result[["simulate_result"]][["count_data"]]
#' dim(count_data)
#' col_data <- simulate_result[["simulate_result"]][["col_meta"]]
#' table(col_data$group)
#' table(col_data$batch)
#'
#'
#' # (5) Simulate three groups (de.prob = 0.2) and two batches
#' ## Since we can not set nBatches directly, so we can set batchCells (a numeric vector)
#' ## to determin the number of batches and cells simutaniously.
#' simulate_result <- simmethods::SCRIP_simulation(parameters = estimate_result[["estimate_result"]],
#' ref_data = ref_data,
#' other_prior = list(prob.group = c(0.4, 0.3, 0.3),
#' batchCells = c(80, 80),
#' de.prob = 0.2),
#' return_format = "list",
#' verbose = TRUE,
#' seed = 111)
#' count_data <- simulate_result[["simulate_result"]][["count_data"]]
#' dim(count_data)
#' col_data <- simulate_result[["simulate_result"]][["col_meta"]]
#' table(col_data$group)
#' table(col_data$batch)
#' ## row data
#' row_data <- simulate_result[["simulate_result"]][["row_meta"]]
#' ### DEGsj
#' table(row_data$de_gene)
#' ### fold change of Group1 to Group2
#' fc_group1_to_group2 <- row_data$DEFacGroup2/row_data$DEFacGroup1
#' table(fc_group1_to_group2 != 1)[2]/4000
#' ### fold change of Group1 to Group3
#' fc_group1_to_group3 <- row_data$DEFacGroup3/row_data$DEFacGroup1
#' table(fc_group1_to_group3 != 1)[2]/4000
#' ### fold change of Group2 to Group3
#' fc_group2_to_group3 <- row_data$DEFacGroup3/row_data$DEFacGroup2
#' table(fc_group2_to_group3 != 1)[2]/4000
#'
#' # (6) Simulate continous cell trajectory
#' simulate_result <- simmethods::SCRIP_simulation(ref_data = ref_data,
#' parameters = estimate_result[["estimate_result"]],
#' other_prior = list(batchCells = 1000,
#' nGenes = 1000,
#' paths = TRUE),
#' return_format = "SingleCellExperiment",
#' verbose = TRUE,
#' seed = 111)
#' ## plot
#' result <- scater::logNormCounts(simulate_result[["simulate_result"]])
#' result <- scater::runPCA(result)
#' plotPCA(result, colour_by = "group")
#'
#' # (7) Simulate continous cell trajectory (three groups)
#' simulate_result <- simmethods::SCRIP_simulation(ref_data = ref_data,
#' parameters = estimate_result[["estimate_result"]],
#' other_prior = list(batchCells = 1000,
#' nGenes = 1000,
#' paths = TRUE,
#' prob.group = c(0.2, 0.4, 0.4),
#' de.prob = 0.4),
#' return_format = "SingleCellExperiment",
#' verbose = TRUE,
#' seed = 111)
#' ## plot
#' result <- scater::logNormCounts(simulate_result[["simulate_result"]])
#' result <- scater::runPCA(result)
#' plotPCA(result, colour_by = "group")
#' }
#'
SCRIP_simulation <- function(parameters,
ref_data = NULL,
other_prior = NULL,
return_format,
verbose = FALSE,
seed
){
##############################################################################
#### Environment ###
##############################################################################
if(!requireNamespace("SCRIP", quietly = TRUE)){
message("SCRIP is not installed on your device")
message("Installing SCRIP")
devtools::install_github("thecailab/SCRIP")
}
##############################################################################
#### Check ###
##############################################################################
assertthat::assert_that(class(parameters) == "SplatParams")
# Check datasets
if(is.null(ref_data)){
stop("Please input reference data to simulate new datasets!")
}
# Change parameters
if(!is.null(other_prior)){
parameters <- simutils::set_parameters(parameters = parameters,
other_prior = other_prior,
method = "SCRIP")
}
if(!is.null(other_prior[["prob.group"]])){
parameters <- splatter::setParam(parameters,
name = "group.prob",
value = other_prior[["prob.group"]])
}
# Get params to check
params_check <- splatter::getParams(parameters, c("nBatches",
"batchCells",
"nGroups",
"group.prob",
"de.prob",
"nCells",
"nGenes"))
# check batches
if(length(params_check[["batchCells"]]) > 1){
assertthat::assert_that(params_check[["nBatches"]] > 1)
}else{
assertthat::assert_that(params_check[["nBatches"]] == 1)
}
# check groups
if(length(params_check[["group.prob"]]) > 1){
assertthat::assert_that(params_check[["nGroups"]] > 1)
}else{
assertthat::assert_that(params_check[["nGroups"]] == 1)
}
# DEGs proportion
de.prob <- params_check[["de.prob"]]
# Return to users
message(paste0("nCells: ", params_check[['nCells']]))
message(paste0("nGenes: ", params_check[['nGenes']]))
message(paste0("nGroups: ", params_check[['nGroups']]))
message(paste0("de.prob: ", de.prob))
message(paste0("nBatches: ", params_check[['nBatches']]))
##############################################################################
#### Simulation ###
##############################################################################
if(verbose){
message("Simulating datasets using SCRIP")
}
# Seed
parameters <- splatter::setParam(parameters, name = "seed", value = seed)
parameters <- splatter::setParam(parameters,
name = "de.prob",
value = de.prob/params_check[['nGroups']])
# mode
if(is.null(other_prior[["mode"]])){
mode <- "GP-trendedBCV"
}else{
mode <- other_prior[["mode"]]
}
# Simulation
if(!is.null(other_prior[["paths"]])){
cat("Simulating trajectory datasets by SCRIP")
submethod <- "paths"
if(!is.null(other_prior[["path.from"]])){
parameters <- splatter::setParam(parameters,
name = "path.from",
value = other_prior[["path.from"]])
}else{
parameters <- splatter::setParam(parameters,
name = "path.from",
value = seq(1:params_check[['nGroups']])-1)
}
}else{
if(params_check[["nGroups"]] == 1){
submethod <- "single"
}else if(params_check[["nGroups"]] != 1){
submethod <- "groups"
}
}
set.seed(seed)
simulate_detection <- peakRAM::peakRAM(
simulate_result <- SCRIP::SCRIPsimu(data = ref_data,
params = parameters,
method = submethod,
mode = mode)
)
##############################################################################
#### Format Conversion ###
##############################################################################
# counts
counts <- as.matrix(SingleCellExperiment::counts(simulate_result))
# col_data
col_data <- as.data.frame(SummarizedExperiment::colData(simulate_result))
if(params_check[['nGroups']] == 1){
col_data[, 3] <- rep("Group1", ncol(simulate_result))
}else{
if(!is.null(other_prior[["paths"]])){
col_data$Group <- stringr::str_replace_all(col_data$Group,
pattern = "Path",
replacement = "Group")
col_data <- col_data[, -c(4, 5)]
}else{
col_data <- col_data[, -4]
}
}
colnames(col_data) <- c("cell_name", "batch", "group")
# row_data
row_data <- as.data.frame(SummarizedExperiment::rowData(simulate_result))
if(params_check[['nGroups']] == 1){
row_data <- data.frame("gene_name" = rownames(counts))
rownames(row_data) <- rownames(counts)
}else{
group_fac <- row_data[, grep(colnames(row_data), pattern = "^DE")]
batch_fac <- row_data[, grep(colnames(row_data), pattern = "^Batch")]
total_sum <- rowSums(group_fac)
de_gene <- ifelse(total_sum == params_check[['nGroups']], "no", "yes")
row_data[, 2] <- de_gene
if(!is.null(other_prior[["paths"]])){
row_data <- row_data[, -c(3:4, (ncol(row_data)-ncol(group_fac)+1):ncol(row_data))]
colnames(row_data) <- c("gene_name", "de_gene", colnames(batch_fac), paste0("DEFacGroup", 1:ncol(group_fac)))
}else{
row_data <- row_data[, -c(3:4)]
colnames(row_data) <- c("gene_name", "de_gene", colnames(batch_fac), colnames(group_fac))
}
}
# Establish SingleCellExperiment
simulate_result <- SingleCellExperiment::SingleCellExperiment(list(counts = counts),
colData = col_data,
rowData = row_data)
simulate_result <- simutils::data_conversion(SCE_object = simulate_result,
return_format = return_format)
##############################################################################
#### Ouput ###
##############################################################################
simulate_output <- list(simulate_result = simulate_result,
simulate_detection = simulate_detection)
return(simulate_output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.