#' SimBu: Bias-aware simulation of bulk RNA-seq data with variable cell type composition
#'
#' As complex tissues are typically composed of various cell types, deconvolution tools have been developed to computationally
#' infer their cellular composition from bulk RNA sequencing (RNA-seq) data. To comprehensively assess deconvolution performance,
#' gold-standard datasets are indispensable. The simulation of ‘pseudo-bulk’ data, generated by aggregating single-cell RNA-seq (scRNA-seq)
#' expression profiles in pre-defined proportions, offers a scalable and cost-effective way of generating these gold-standard datasets.
#' SimBu was developed to simulate pseudo-bulk samples based on various simulation scenarios, designed to test specific features of deconvolution
#' methods. A unique feature of SimBu is the modelling of cell-type-specific mRNA bias using experimentally-derived or data-driven scaling factors.
#'
#' @section Dataset generation:
#' You will need an annotated scRNA-seq dataset (as matrix file, h5ad file, Seurat object), which is the baseline for the simulations.
#' Use the dataset_* functions to generate a SummarizedExperiment, that holds all important information.
#' It is also possible to access scRNA-seq datasets through the public database Sfaira, by using the functions dataset_sfaira() and
#' dataset_sfaira_multiple().
#'
#' @section Simulation:
#' Use the simulate_bulk() function to generate multiple pseudo-bulk samples, which will be returned as a SummarizedExperiment. You can
#' adapt the cell type fractions in each sample by changing the scenario parameter.
#'
#' @section Visulaization:
#' Inspect the cell type composition of your simulations with the plot_simulation() function.
#'
#' @docType package
#' @name SimBu
NULL
#> NULL
#' simulate single pseudo-bulk sample
#'
#' function to sample cells according to given cell-type fractions. This creates a single pseudo-bulk sample by calculating the
#' mean expression value per gene over all sampled cells.
#' Note: if total_read_counts is used, the cell-fractions are applied to the number of counts, not the number of cells!
#' @param data \link[SummarizedExperiment]{SummarizedExperiment} object
#' @param scaling_vector vector with scaling values for each cell; calculated by the \code{calc_scaling_vector} function
#' @param simulation_vector named vector with wanted cell-types and their fractions
#' @param total_cells numeric; number of total cells for this simulation
#' @param total_read_counts numeric; sets the total read count value for each sample
#' @param remove_bias_in_counts boolean; if TRUE (default) the internal mRNA bias that is present in count data will be *removed* using the number of reads mapped to each cell
#' @param remove_bias_in_counts_method 'read-number' (default) or 'gene-number'; method with which the mRNA bias in counts will be removed
#' @param norm_counts boolean; if TRUE the samples simulated with counts will be normalized to CPMs, default is FALSE
#' @param seed numeric; fix this value if you want the same cells to be sampled
#'
#' @return returns two vectors (one based on counts, one based on tpm; depends on which matrices are present in data) with expression values for all genes in the provided dataset
#'
#' @keywords internal
simulate_sample <- function(data,
scaling_vector,
simulation_vector,
total_cells,
total_read_counts,
remove_bias_in_counts,
remove_bias_in_counts_method,
norm_counts,
seed) {
if (!all(names(simulation_vector) %in% unique(SummarizedExperiment::colData(data)[["cell_type"]]))) {
stop("Some cell-types in the provided simulation vector are not in the annotation.")
}
if (!all.equal(sum(simulation_vector), 1)) {
em <- paste0("The sum of the cell-type fractions can not be larger than 1. The sum currently is: ", sum(simulation_vector))
stop(em)
}
# loop over all wanted cell-types and sample to have the final amount
sampled_cells <- lapply(seq_along(simulation_vector), function(x) {
# get all cells with the current type
cells_of_type_x <- data.frame(SummarizedExperiment::colData(data)[SummarizedExperiment::colData(data)[["cell_type"]] == names(simulation_vector[x]), ])
if (simulation_vector[x] == 0) {
cells <- c()
} else {
# how many cells of this type do we need?
n <- round(total_cells * simulation_vector[x])
if (n == 0) {
n <- 1
}
if(!is.na(seed)){
# fix seed for random selection of cells
set.seed(seed)
}
cells <- dplyr::slice_sample(cells_of_type_x, n = n, replace = TRUE)
cells <- cells[["cell_ID"]]
}
return(cells)
})
# annotation
# names(sampled_cells) <- names(simulation_vector)
# simulated_annotation <- utils::stack(sampled_cells)
# get the corresponding columns from the data
data_sampled <- data[, unlist(sampled_cells)]
# remove mRNA bias from count data by dividing count matrix by nReads or nGenes per cell
if (remove_bias_in_counts) {
if (remove_bias_in_counts_method == "read-number") {
remove_scaling_vector <- data_sampled[["nReads_SimBu"]] # access meta data in SummarizedExperiment directly like this
} else if (remove_bias_in_counts_method == "gene-number") {
remove_scaling_vector <- data_sampled[["nGenes_SimBu"]]
} else {
stop('If you want to remove the internal mRNA bias from counts, please specify the method you want to use with remove_bias_in_counts_method. Possible are "read-number" or "gene-number".')
}
# replace count matrix in SummarizedExperiment
count_matrix_removed_bias <- Matrix::t(Matrix::t(SummarizedExperiment::assays(data_sampled)[["counts"]]) / remove_scaling_vector)
SummarizedExperiment::assays(data_sampled)[["counts"]] <- count_matrix_removed_bias
}
# apply scaling vector on the sampled cells in the matrices
scaling_vector <- scaling_vector[unlist(sampled_cells)]
simulated_count_vector <- NULL
simulated_tpm_vector <- NULL
# apply scaling factor
# & sum up counts/tpms of all cells per gene to get single bulk sample
# & apply possible normalization of sample
simulated_count_vector <- Matrix::rowSums(Matrix::t(Matrix::t(SummarizedExperiment::assays(data_sampled)[["counts"]]) * scaling_vector))
# if total_read_counts is used, rarefy/scale count data to get the desired sequencing depth
if (!is.null(total_read_counts)) {
sum_counts <- sum(simulated_count_vector)
if (sum_counts > total_read_counts) {
tmp_phylo <- phyloseq::otu_table(data.frame(simulated_count_vector), taxa_are_rows = TRUE)
tmp_vec <- data.frame(phyloseq::rarefy_even_depth(physeq = tmp_phylo, sample.size = total_read_counts, trimOTUs = FALSE, ))[, 1]
names(tmp_vec) <- names(simulated_count_vector)
simulated_count_vector <- tmp_vec
} else if (sum_counts < total_read_counts) {
simulated_count_vector <- simulated_count_vector / sum(simulated_count_vector) * total_read_counts
}
}
# normalize sample based on counts to CPMs
if (norm_counts) {
simulated_count_vector <- (simulated_count_vector / sum(simulated_count_vector)) * 1e6
}
# only generate simulation based on TPM if tpm assay is present
if ("tpm" %in% names(SummarizedExperiment::assays(data))) {
simulated_tpm_vector <- Matrix::rowSums(Matrix::t(Matrix::t(SummarizedExperiment::assays(data_sampled)[["tpm"]]) * scaling_vector))
# normalize tpm based sample to 1e6
simulated_tpm_vector <- (simulated_tpm_vector / sum(simulated_tpm_vector)) * 1e6
}
return(list(
simulated_count_vector = simulated_count_vector,
simulated_tpm_vector = simulated_tpm_vector
))
}
#' Simulate whole pseudo-bulk RNAseq dataset
#'
#' This function allows you to create a full pseudo-bulk RNAseq dataset. You need to provide a \link[SummarizedExperiment]{SummarizedExperiment} from which the cells
#' will be sampled for the simulation. Also a \code{scenario} has to be selected, where you can choose how the cells will be sampled and a
#' \code{scaling_factor} on how the read counts will be transformed proir to the simulation.
#'
#' @param data (mandatory) \link[SummarizedExperiment]{SummarizedExperiment} object
#' @param scenario (mandatory) select on of the pre-defined cell-type fraction scenarios; possible are: \code{even},\code{random},\code{mirror_db},\code{pure},\code{weighted}; you can also use the \code{custom} scenario, where you need to set the \code{custom_scenario_data} parameter.
#' @param scaling_factor (mandatory) name of scaling factor; possible are: \code{census}, \code{spike_in}, \code{read_number}, \code{expressed_genes}, \code{custom}, \code{epic}, \code{abis}, \code{quantiseq} or \code{NONE} for no scaling factor
#' @param scaling_factor_single_cell boolean: decide if a scaling value for each single cell is calculated (default) or the median of all scaling values for each cell type is calculated
#' @param weighted_cell_type name of cell-type used for \code{weighted} scenario
#' @param weighted_amount fraction of cell-type used for \code{weighted} scenario; must be between \code{0} and \code{0.99}
#' @param pure_cell_type name of cell-type for \code{pure} scenario
#' @param custom_scenario_data dataframe; needs to be of size \code{nsamples} x number_of_cell_types, where each sample is a row and each entry is the cell-type fraction. Rows need to sum up to 1.
#' @param custom_scaling_vector named vector with custom scaling values for cell-types. Cell-types that do not occur in this vector but are present in the dataset will be set to 1; mandatory for \code{custom} scaling factor
#' @param balance_even_mirror_scenario balancing value for the \code{uniform} and \code{mirror_db} scenarios: increasing it will result in more diverse simulated fractions. To get the same fractions in each sample, set to 0. Default is 0.01.
#' @param remove_bias_in_counts boolean; if TRUE the internal mRNA bias that is present in count data will be *removed* using the number of reads mapped to each cell. Default to FALSE
#' @param remove_bias_in_counts_method 'read-number' (default) or 'gene-number'; method with which the mRNA bias in counts will be removed
#' @param norm_counts boolean; if TRUE the samples simulated with counts will be normalized to CPMs, default is FALSE
#' @param nsamples numeric; number of samples in pseudo-bulk RNAseq dataset (default = 100)
#' @param ncells numeric; number of cells in each dataset (default = 1000)
#' @param total_read_counts numeric; sets the total read count value for each sample
#' @param whitelist list; give a list of cell-types you want to keep for the simulation; if NULL, all are used
#' @param blacklist list; give a list of cell-types you want to remove for the simulation; if NULL, all are used; is applied after whitelist
#' @param seed numeric; specifiy a seed for RNG. This effects cell sampling; with a fixed seed you will always sample the same cells for each sample (seed value is incrased by 1 for each sample). Default = NA (two simulation runs will sample different cells).
#' @param BPPARAM BiocParallel::bpparam() by default; if specific number of threads x want to be used, insert: BiocParallel::MulticoreParam(workers = x)
#' @param run_parallel boolean, decide if multi-threaded calculation will be run. FALSE by default
#'
#'
#' @return named list; \code{bulk} a \link[SummarizedExperiment]{SummarizedExperiment} object, where the assays store the simulated bulk RNAseq datasets. Can hold either one or two assays, depending on how many matrices were present in the dataset
#' \code{cell-fractions} is a dataframe with the simulated cell-fractions per sample;
#' \code{scaling_vector} scaling value for each cell in dataset
#' @export
#'
#' @examples
#' # generate sample single-cell data to work with:
#'
#' counts <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::t(1e6 * Matrix::t(tpm) / Matrix::colSums(tpm))
#'
#' colnames(counts) <- paste0("cell_", rep(1:300))
#' colnames(tpm) <- paste0("cell_", rep(1:300))
#' rownames(counts) <- paste0("gene_", rep(1:1000))
#' rownames(tpm) <- paste0("gene_", rep(1:1000))
#'
#' annotation <- data.frame(
#' "ID" = paste0("cell_", rep(1:300)),
#' "cell_type" = c(
#' rep("T cells CD4", 50),
#' rep("T cells CD8", 50),
#' rep("Macrophages", 100),
#' rep("NK cells", 10),
#' rep("B cells", 70),
#' rep("Monocytes", 20)
#' )
#' )
#'
#' dataset <- SimBu::dataset(
#' annotation = annotation,
#' count_matrix = counts,
#' tpm_matrix = tpm,
#' name = "test_dataset"
#' )
#'
#' # this creates a basic pseudo-bulk dataset with uniform cell-type distribution
#' # and no additional transformation of the data with 10 samples and 2000 cells each
#'
#' s <- SimBu::simulate_bulk(dataset,
#' scenario = "even",
#' scaling_factor = "NONE",
#' nsamples = 10,
#' ncells = 100
#' )
#'
#' # use a blacklist to exclude certain cell-types for the simulation
#' s <- SimBu::simulate_bulk(dataset,
#' scenario = "even",
#' scaling_factor = "NONE",
#' nsamples = 10,
#' ncells = 2000,
#' blacklist = c("Monocytes", "Macrophages")
#' )
#'
#'
#' # use the pure scenario to only have B cells
#' s <- SimBu::simulate_bulk(dataset,
#' scenario = "pure",
#' scaling_factor = "NONE",
#' nsamples = 10,
#' ncells = 100,
#' pure_cell_type = "B cells"
#' )
#'
#' # simulate a dataset with custom cell-type fraction for each of the 3 samples
#' fractions <- data.frame(
#' "B cells" = c(0.2, 0.4, 0.2),
#' "T cells CD4" = c(0.4, 0.2, 0.1),
#' "Macrophages" = c(0.4, 0.4, 0.7), check.names = FALSE
#' )
#' s <- SimBu::simulate_bulk(dataset,
#' scenario = "custom",
#' scaling_factor = "NONE",
#' nsamples = 3,
#' ncells = 2000,
#' custom_scenario_data = fractions
#' )
#'
simulate_bulk <- function(data,
scenario = c("even", "random", "mirror_db", "weighted", "pure", "custom"),
scaling_factor = c("NONE", "census", "spike_in", "custom", "read_number", "expressed_genes", "annotation_column", "epic", "abis", "quantiseq"),
scaling_factor_single_cell = TRUE,
weighted_cell_type = NULL,
weighted_amount = NULL,
pure_cell_type = NULL,
custom_scenario_data = NULL,
custom_scaling_vector = NULL,
balance_even_mirror_scenario = 0.01,
remove_bias_in_counts = FALSE,
remove_bias_in_counts_method = "read-number",
norm_counts = FALSE,
nsamples = 100,
ncells = 1000,
total_read_counts = NULL,
whitelist = NULL,
blacklist = NULL,
seed = NA,
BPPARAM = BiocParallel::bpparam(),
run_parallel = FALSE) {
# switch multi-threading on/off
if (!run_parallel) {
BPPARAM <- BiocParallel::MulticoreParam(workers = 1)
} else {
message("Using parallel generation of simulations.")
}
# keep only cell-types which are in whitelist in annotation & count matrix
if (!is.null(whitelist)) {
if (!all(whitelist %in% SummarizedExperiment::colData(data)[["cell_type"]])) {
stop("Did not find all cell-types of whitelist in annotation.")
}
data <- data[, SummarizedExperiment::colData(data)[SummarizedExperiment::colData(data)[["cell_type"]] %in% whitelist, ][["cell_ID"]]]
if (dim(data)[2] == 0) {
stop("No cells are left after using this whitelist; please check that the correct names are used.")
}
}
# remove cell-types which are in blacklist from annotation & count matrix
if (!is.null(blacklist)) {
if (!all(blacklist %in% SummarizedExperiment::colData(data)[["cell_type"]])) {
stop("Did not find all cell-types of blacklist in annotation.")
}
data <- data[, SummarizedExperiment::colData(data)[!SummarizedExperiment::colData(data)[["cell_type"]] %in% blacklist, ][["cell_ID"]]]
if (dim(data)[2] == 0) {
stop("No cells are left after using this blacklist; please check that the correct names are used.")
}
}
##### different cell-type scenarios #####
# each existing cell-type will be appearing in equal amounts
if (scenario == "even") {
all_types <- unique(SummarizedExperiment::colData(data)[["cell_type"]])
n_cell_types <- length(all_types)
simulation_vector_list <- lapply(seq_len(nsamples), function(x) {
m <- round(matrix(abs(stats::rnorm(length(all_types), mean = 1 / length(all_types), sd = balance_even_mirror_scenario)), ncol = n_cell_types), 3)
m <- sweep(m, 1, rowSums(m), FUN = "/")
simulation_vector <- as.vector(m[1, ])
names(simulation_vector) <- all_types
return(simulation_vector)
})
# give each sample a name
sample_names <- paste0("even_sample", seq_len(nsamples))
names(simulation_vector_list) <- sample_names
}
# generate random cell-type fractions (depending on appearance in database)
if (scenario == "random") {
# generate 'nsamples' random samples
simulation_vector_list <- lapply(seq_len(nsamples), function(x) {
n_cell_types <- length(unique(SummarizedExperiment::colData(data)[["cell_type"]]))
# generate n_cell_type amount of random fractions from the uniform distribution, which will sum up to 1
m <- round(matrix(stats::runif(n_cell_types, 0, 1), ncol = n_cell_types), 3)
m <- sweep(m, 1, rowSums(m), FUN = "/")
simulation_vector <- as.vector(m[1, ])
names(simulation_vector) <- unique(SummarizedExperiment::colData(data)[["cell_type"]])
return(simulation_vector)
})
sample_names <- paste0("random_sample", seq_len(nsamples))
names(simulation_vector_list) <- sample_names
}
# generate cell-type fractions, which mirror the fraction of each cell-type in the used dataset
if (scenario == "mirror_db") {
n_cell_types <- length(unique(SummarizedExperiment::colData(data)[["cell_type"]]))
# generate 'nsamples' random samples
simulation_vector_list <- lapply(seq_len(nsamples), function(x) {
# each cell-type will be represented as many times as it occurs in the used dataset
mirror_values <- table(SummarizedExperiment::colData(data)[["cell_type"]]) / ncol(data)
m <- unlist(lapply(mirror_values, function(y) {
return(abs(round(stats::rnorm(1, mean = y, sd = balance_even_mirror_scenario), 3)))
}))
m <- matrix(m, ncol = n_cell_types)
m <- sweep(m, 1, rowSums(m), FUN = "/")
simulation_vector <- as.vector(m[1, ])
names(simulation_vector) <- names(mirror_values)
return(simulation_vector)
})
sample_names <- paste0("mirror_db_sample", seq_len(nsamples))
names(simulation_vector_list) <- sample_names
}
# one cell-type will be highly over represented, the others are random
if (scenario == "weighted") {
if (is.null(weighted_cell_type) || is.null(weighted_amount)) {
stop("The weighted scenario requires you to select one cell-type which will be over represented")
}
if (weighted_amount > 0.99 || weighted_amount < 0) {
stop("The weighted cell-type fraction needs to be between 0 and 0.99.")
}
if (!weighted_cell_type %in% unique(SummarizedExperiment::colData(data)[["cell_type"]])) {
stop("The weighted cell-type could not be found in your dataset.")
}
random_cell_types <- setdiff(unique(SummarizedExperiment::colData(data)[["cell_type"]]), weighted_cell_type)
all_cell_types <- c(random_cell_types, weighted_cell_type)
simulation_vector_list <- lapply(seq_len(nsamples), function(x) {
n_cell_types <- length(unique(SummarizedExperiment::colData(data)[["cell_type"]])) - 1
# generate n_cell_type amount of random fractions from the uniform distribution, which will sum up to 1
m <- matrix(stats::runif(n_cell_types, 0, 1), ncol = n_cell_types)
simulation_vector <- as.vector(m[1, ])
simulation_vector <- (1 - weighted_amount) * simulation_vector / sum(simulation_vector)
simulation_vector <- append(simulation_vector, weighted_amount)
names(simulation_vector) <- all_cell_types
return(simulation_vector)
})
sample_names <- paste0("weighted_sample", seq_len(nsamples))
names(simulation_vector_list) <- sample_names
}
# pure: only simulate a single cell-type
if (scenario == "pure") {
if (is.null(pure_cell_type)) {
stop("The pure scenario requires you to select a cell-type which will be simulated")
}
simulation_vector_list <- lapply(seq_len(nsamples), function(x) {
simulation_vector <- c(1)
names(simulation_vector) <- as.character(pure_cell_type)
return(simulation_vector)
})
sample_names <- paste0("pure_sample", seq_len(nsamples))
names(simulation_vector_list) <- sample_names
}
# custom fractions
if (scenario == "custom") {
if (is.null(custom_scenario_data)) {
stop("You need to provide a dataframe with you custom cell-type fractions in this scenario with the custom_scenario_data parameter.")
}
# check dimensions and cell-types in dataframe; samples are rows
if (nrow(custom_scenario_data) != nsamples) {
stop("The scenario data has a differnt amount of samples than you want to simulate.")
}
if (!all(colnames(custom_scenario_data) %in% unique(SummarizedExperiment::colData(data)[["cell_type"]]))) {
stop("Could not find all cell-types from scenario data in annotation.")
}
simulation_vector_list <- as.list(as.data.frame(t(custom_scenario_data)))
simulation_vector_list <- lapply(simulation_vector_list, function(x) {
names(x) <- colnames(custom_scenario_data)
x <- x
})
sample_names <- paste0("custom_sample", seq_len(nsamples))
names(simulation_vector_list) <- sample_names
}
##### pre-calculate scaling factors ####
scaling_vector <- calc_scaling_vector(
data = data,
scaling_factor = scaling_factor,
custom_scaling_vector = custom_scaling_vector,
scaling_factor_single_cell = scaling_factor_single_cell,
BPPARAM = BPPARAM,
run_parallel = run_parallel
)
##### generate the samples #####
# sample cells and generate pseudo-bulk profiles
all_samples <- BiocParallel::bplapply(seq_along(simulation_vector_list), function(i) {
simulation_vector <- simulation_vector_list[[i]]
sample <- simulate_sample(
data = data,
scaling_vector = scaling_vector,
simulation_vector = simulation_vector,
total_cells = ncells,
total_read_counts = total_read_counts,
remove_bias_in_counts = remove_bias_in_counts,
remove_bias_in_counts_method = remove_bias_in_counts_method,
norm_counts = norm_counts,
seed = ifelse(is.na(seed), seed, seed + i) # ensure that samples still are different if seed is set
)
return(sample)
}, BPPARAM = BPPARAM)
bulk_counts <- Matrix::Matrix(vapply(
X = all_samples,
FUN = "[[",
... = 1,
FUN.VALUE = double(dim(SummarizedExperiment::assays(data)[["counts"]])[1])
),
sparse = TRUE
)
if ("tpm" %in% names(SummarizedExperiment::assays(data))) {
bulk_tpm <- Matrix::Matrix(vapply(
X = all_samples,
FUN = "[[",
... = 2,
FUN.VALUE = double(dim(SummarizedExperiment::assays(data)[["tpm"]])[1])
),
sparse = TRUE
)
assays <- list(bulk_counts = bulk_counts, bulk_tpm = bulk_tpm)
} else {
assays <- list(bulk_counts = bulk_counts)
}
# new SummarizedExperiment with bulk datasets based on counts or TPM or both
se_bulk <- SummarizedExperiment::SummarizedExperiment(assays = assays)
colnames(se_bulk) <- sample_names
# remove non-unique features/genes from simulated dataset
if (length(unique(rownames(se_bulk))) != dim(se_bulk)[1]) {
un <- unique(rownames(se_bulk))
se_bulk <- se_bulk[un, ]
}
# cell_fractions for all simulated samples
cell_fractions <- data.frame(t(data.frame(simulation_vector_list)), check.names = FALSE)
message("Finished simulation.")
return(list(
bulk = se_bulk,
cell_fractions = cell_fractions,
scaling_vector = scaling_vector
))
}
#' Calculate scaling factor for a dataset
#'
#' Each scaling factor has a default matrix it will try to use (counts or TPM). If the required matrix is not available, the other one is used and a warning is given.
#'
#' @param data dataset object
#' @param scaling_factor name of scaling factor; possible are: \code{census}, \code{spike_in}, \code{read_number}, \code{custom} or \code{NONE} for no scaling factor
#' @param custom_scaling_vector named vector with custom scaling values for cell-types. Cell-types that do not occur in this vector but are present in the dataset will be set to 1
#' @param scaling_factor_single_cell boolean: decide if a scaling value for each single cell is calculated (default) or the median of all scaling values for each cell type is calculated
#' @param BPPARAM BiocParallel::bpparam() by default; if specific number of threads x want to be used, insert: BiocParallel::MulticoreParam(workers = x)
#' @param run_parallel boolean, decide if multi-threaded calculation will be run. FALSE by default
#'
#' @return a named vector with a scaling value for each cell in the dataset
#'
#' @keywords internal
calc_scaling_vector <- function(data, scaling_factor, custom_scaling_vector, scaling_factor_single_cell, BPPARAM, run_parallel) {
if (scaling_factor == "census") {
if ("tpm" %in% names(SummarizedExperiment::assays(data))) {
m <- SummarizedExperiment::assays(data)[["tpm"]]
} else {
warning("Scaling factor 'Census' requires TPM data, which is not available in the given dataset. Counts will be used instead.")
m <- SummarizedExperiment::assays(data)[["counts"]]
}
scaling_vector <- census(m, expr_threshold = 0.1, exp_capture_rate = .25, BPPARAM = BPPARAM, run_parallel = run_parallel)
scaling_vector[which(is.na(scaling_vector))] <- 1
scaling_vector <- scaling_vector / 1e6
} else if (scaling_factor == "spike_in") {
# if you want to transform your counts by spike_in data, an additional column in the annotation table is needed
# with name "spike_in"; the matrix counts will then be transformed accordingly
if (!"spike_in" %in% colnames(SummarizedExperiment::colData(data))) {
stop("No column with spike_in information in annotation data. Check your dataset again!")
}
# get subset of spike_in counts from the sampled cells
anno_sub <- SummarizedExperiment::colData(data)[, c("cell_ID", "spike_in", "nReads_SimBu")]
tot_counts <- anno_sub[["nReads_SimBu"]] + anno_sub[["spike_in"]] # these are all the reads mapped to transcripts *and* spike-ins
scaling_vector <- anno_sub[["nReads_SimBu"]] / tot_counts # this is the fractions of reads mapped to 'true' transcripts
names(scaling_vector) <- anno_sub[["cell_ID"]]
} else if (scaling_factor == "read_number") {
anno_sub <- SummarizedExperiment::colData(data)[, c("cell_ID", "nReads_SimBu")]
scaling_vector <- anno_sub[["nReads_SimBu"]]
names(scaling_vector) <- anno_sub[["cell_ID"]]
} else if (scaling_factor == "expressed_genes") {
anno_sub <- SummarizedExperiment::colData(data)[, c("cell_ID", "nGenes_SimBu")]
scaling_vector <- anno_sub[["nGenes_SimBu"]]
names(scaling_vector) <- anno_sub[["cell_ID"]]
} else if (scaling_factor == "custom") {
# needs vector with values for existing cell-types
# cell-types that do not occur in this vector will have scaling-factor of 1
if (is.null(custom_scaling_vector)) {
stop("For the custom scaling factor you need to provide a custom_scaling_vector!")
}
message("Using custom scaling factors.")
scaling_vector <- merge_scaling_factor(
data = data,
scaling_factor_values = custom_scaling_vector,
scaling_factor_name = scaling_factor
)
} else if (scaling_factor == "NONE") {
scaling_vector <- rep(1, ncol(data))
names(scaling_vector) <- SummarizedExperiment::colData(data)[["cell_ID"]]
} else if (scaling_factor == "epic") {
message("Using EPIC scaling factors.")
scaling_vector <- merge_scaling_factor(
data = data,
scaling_factor_values = pkg.globals$epic_scaling,
scaling_factor_name = scaling_factor
)
} else if (scaling_factor == "abis") {
message("Using ABIS scaling factors.")
scaling_vector <- merge_scaling_factor(
data = data,
scaling_factor_values = pkg.globals$abis_scaling,
scaling_factor_name = scaling_factor
)
} else if (scaling_factor == "quantiseq") {
message("Using quanTIseq scaling factors.")
scaling_vector <- merge_scaling_factor(
data = data,
scaling_factor_values = pkg.globals$qtsq_scaling,
scaling_factor_name = scaling_factor
)
} else if (!is.null(scaling_factor)) {
message("Scaling by custom column in annotation table; if no scaling is wished instead, use 'NONE'.")
if (!scaling_factor %in% colnames(SummarizedExperiment::colData(data))) {
em <- paste0("A column with the name ", scaling_factor, " cannot be found in the annotation.")
stop(em)
}
if (!is.numeric(SummarizedExperiment::colData(data)[, c(scaling_factor)])) {
em <- paste0("The column with the name ", scaling_factor, " is not numeric and cannot be used for scaling.")
stop(em)
}
anno_sub <- SummarizedExperiment::colData(data)[, c("cell_ID", scaling_factor)]
colnames(anno_sub) <- c("a", "b")
scaling_vector <- anno_sub$b
names(scaling_vector) <- anno_sub$a
} else {
warning("No valid scaling factor method provided. Scaling all cells by 1.")
scaling_vector <- rep(1, ncol(data))
names(scaling_vector) <- SummarizedExperiment::colData(data)[["cell_ID"]]
}
# calculate the median scaling value per cell-type
if (!scaling_factor_single_cell) {
df <- data.frame(
value = scaling_vector,
type = SummarizedExperiment::colData(data)[["cell_type"]],
id = names(scaling_vector)
)
df$median_value <- stats::ave(df$value, df$type, FUN = stats::median)
scaling_vector <- df[["median_value"]]
names(scaling_vector) <- df[["id"]]
}
return(scaling_vector)
}
# normalize samples to one million -> TPM
cpm_normalize <- function(matrix) {
m <- Matrix::t(1e6 * Matrix::t(matrix) / Matrix::colSums(matrix))
if (sum(is.na(m)) > 0) {
warning("There are NA entries in the normalized matrix.")
}
return(m)
}
#' Save the expression matrix of a simulated pseudo-bulk dataset to a file
#'
#' @param simulation the result of simulate_bulk()
#' @param filename the filename where to save the expression matrix to
#' @param assay name of the assay in simulation to save, default to bulk_counts
#'
#' @return write a file
#' @export
#'
#' @examples
#' counts <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::t(1e6 * Matrix::t(tpm) / Matrix::colSums(tpm))
#'
#' colnames(counts) <- paste0("cell_", rep(1:300))
#' colnames(tpm) <- paste0("cell_", rep(1:300))
#' rownames(counts) <- paste0("gene_", rep(1:1000))
#' rownames(tpm) <- paste0("gene_", rep(1:1000))
#'
#' annotation <- data.frame(
#' "ID" = paste0("cell_", rep(1:300)),
#' "cell_type" = c(
#' rep("T cells CD4", 50),
#' rep("T cells CD8", 50),
#' rep("Macrophages", 100),
#' rep("NK cells", 10),
#' rep("B cells", 70),
#' rep("Monocytes", 20)
#' )
#' )
#'
#' dataset <- SimBu::dataset(
#' annotation = annotation,
#' count_matrix = counts,
#' tpm_matrix = tpm,
#' name = "test_dataset"
#' )
#'
#' s <- SimBu::simulate_bulk(dataset,
#' scenario = "even",
#' scaling_factor = "NONE",
#' nsamples = 10,
#' ncells = 100
#' )
#'
#' save_simulation(s, tempfile())
save_simulation <- function(simulation, filename, assay = "bulk_counts") {
utils::write.table(as.matrix(SummarizedExperiment::assays(simulation$bulk)[[assay]]), filename, quote = FALSE, sep = "\t")
}
#' Plot the cell-type fractions in your simulated dataset
#'
#' @param simulation a simulation object generated by \code{simulate_bulk}
#'
#' @return a gpplot2 barplot
#' @export
#'
#' @examples
#' counts <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::t(1e6 * Matrix::t(tpm) / Matrix::colSums(tpm))
#'
#' colnames(counts) <- paste0("cell_", rep(1:300))
#' colnames(tpm) <- paste0("cell_", rep(1:300))
#' rownames(counts) <- paste0("gene_", rep(1:1000))
#' rownames(tpm) <- paste0("gene_", rep(1:1000))
#'
#' annotation <- data.frame(
#' "ID" = paste0("cell_", rep(1:300)),
#' "cell_type" = c(
#' rep("T cells CD4", 50),
#' rep("T cells CD8", 50),
#' rep("Macrophages", 100),
#' rep("NK cells", 10),
#' rep("B cells", 70),
#' rep("Monocytes", 20)
#' )
#' )
#'
#' dataset <- SimBu::dataset(
#' annotation = annotation,
#' count_matrix = counts,
#' tpm_matrix = tpm,
#' name = "test_dataset"
#' )
#'
#' s <- SimBu::simulate_bulk(dataset,
#' scenario = "even",
#' scaling_factor = "NONE",
#' nsamples = 10,
#' ncells = 100
#' )
#'
#' SimBu::plot_simulation(s)
#'
plot_simulation <- function(simulation) {
fractions <- simulation$cell_fractions
fractions$sample <- factor(rownames(fractions), levels = rownames(fractions))
frac_long <- tidyr::gather(fractions, "cell_type", "fraction", seq_len(length(fractions) - 1))
ggplot2::ggplot(data = frac_long) +
ggplot2::geom_col(ggplot2::aes_string(x = "fraction", y = "sample", fill = "cell_type")) +
ggplot2::ggtitle(paste0("Cell-type fractions for \n", nrow(fractions), " pseudo-bulk RNAseq samples")) +
ggplot2::scale_fill_manual(values = grDevices::colorRampPalette(RColorBrewer::brewer.pal(9, "Paired"))(length(unique(frac_long$cell_type)))) +
ggplot2::theme_bw()
}
#' Combine multiple simulations into one result
#'
#' we recommend to only merge simulations from the same dataset object, otherwise the count matrices might not correspond on the gene level
#'
#' @param simulation_list a list of simulations
#'
#' @return named list; \code{bulk} a \link[SummarizedExperiment]{SummarizedExperiment} object, where the assays store the simulated bulk RNAseq datasets. Can hold either one or two assays, depending on how many matrices were present in the dataset
#' \code{cell-fractions} is a dataframe with the simulated cell-fractions per sample;
#' \code{scaling_vector} scaling value for each cell in dataset
#' @export
#'
#' @examples
#' counts <- Matrix::Matrix(matrix(rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::Matrix(matrix(rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::t(1e6 * Matrix::t(tpm) / Matrix::colSums(tpm))
#'
#' colnames(counts) <- paste0("cell_", rep(1:300))
#' colnames(tpm) <- paste0("cell_", rep(1:300))
#' rownames(counts) <- paste0("gene_", rep(1:1000))
#' rownames(tpm) <- paste0("gene_", rep(1:1000))
#'
#' annotation <- data.frame(
#' "ID" = paste0("cell_", rep(1:300)),
#' "cell_type" = c(
#' rep("T cells CD4", 50),
#' rep("T cells CD8", 50),
#' rep("Macrophages", 100),
#' rep("NK cells", 10),
#' rep("B cells", 70),
#' rep("Monocytes", 20)
#' )
#' )
#'
#' dataset <- SimBu::dataset(
#' annotation = annotation,
#' count_matrix = counts,
#' tpm_matrix = tpm,
#' name = "test_dataset"
#' )
#'
#' s1 <- SimBu::simulate_bulk(dataset,
#' scenario = "even",
#' scaling_factor = "NONE",
#' nsamples = 10,
#' ncells = 100
#' )
#'
#' s2 <- SimBu::simulate_bulk(dataset,
#' scenario = "even",
#' scaling_factor = "NONE",
#' nsamples = 10,
#' ncells = 100
#' )
#'
#' s <- SimBu::merge_simulations(list(s1, s2))
merge_simulations <- function(simulation_list) {
# merge SummarizedExperiments
if (length(simulation_list) <= 1) {
stop("You need at least 2 simulations to merge them into one!")
}
n_assays <- unlist(lapply(simulation_list, function(x) {
return(length(names(SummarizedExperiment::assays(x$bulk))))
}))
if (length(unique(n_assays)) != 1) {
stop("The simulations you want to merge have different numbers of assays. Stopping.")
}
merged_se <- do.call(SummarizedExperiment::cbind, lapply(simulation_list, "[[", "bulk"))
# merge cell fractions dataframe
sample_names <- unlist(lapply(simulation_list, function(x) {
rownames(x[["cell_fractions"]])
}))
sample_names <- paste0(sample_names, "_", seq_len(length(sample_names)))
cell_fractions <- data.frame(data.table::rbindlist(lapply(simulation_list, function(x) {
x[["cell_fractions"]]
}), fill = TRUE, use.names = TRUE), check.names = FALSE)
rownames(cell_fractions) <- sample_names
cell_fractions[is.na(cell_fractions)] <- 0
# list of scaling vectors
scaling_vector <- lapply(
X = simulation_list,
FUN = "[[",
... = "scaling_vector"
)
names(scaling_vector) <- names(simulation_list)
return(list(
bulk = merged_se,
cell_fractions = cell_fractions,
scaling_vector = scaling_vector
))
}
#' Create scaling vector from custom or pre-defined scaling factor
#'
#' @param data dataset
#' @param scaling_factor_values named list of scaling values
#' @param scaling_factor_name name of scaling factor method
#'
#' @return scaling vector
#'
#' @keywords internal
merge_scaling_factor <- function(data, scaling_factor_values, scaling_factor_name) {
# these are the name of cell types in dataset
present_types <- unique(data[["cell_type"]])
# names of cell types in dataset and scaling factor
type_intersection <- Reduce(intersect, list(present_types, names(scaling_factor_values)))
if (length(type_intersection) == 0) {
em <- paste0("Could not apply ", scaling_factor_name, " scaling factor, because none of its cell types are present in the provided dataset.")
stop(em)
}
# names of cell types which are in scaling factor, but not in dataset
missing_types <- present_types[which(!present_types %in% names(scaling_factor_values))]
if (length(missing_types) != 0) {
wm <- paste0("For some cell type(s) in the dataset, no scaling factor is available when using ", scaling_factor_name, ": ", missing_types, ". This cell type will not be re-scaled.")
warning(wm)
}
# missing types are not scaled -> value of 1
complete_vector <- rep(1, length(missing_types))
names(complete_vector) <- missing_types
# combine with scaling values of factor, which correspond to cell types present in dataset
complete_vector <- data.frame(value = append(complete_vector, scaling_factor_values[type_intersection]), check.names = FALSE)
# give each cell the corresponding scaling value, depending on its type
df <- merge(complete_vector, data.frame(SummarizedExperiment::colData(data)), by.x = 0, by.y = "cell_type", all.y = TRUE)[, c("value", "cell_ID")]
scaling_vector <- df$value
names(scaling_vector) <- df$cell_ID
return(scaling_vector)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.