Nothing
#' @importFrom ggplot2 ggplot aes geom_violin geom_boxplot geom_line theme ggtitle element_text
NULL
################################################################################
####################### Generate cell composition matrix #######################
################################################################################
#' Generate training and test cell composition matrices
#'
#' Generate training and test cell composition matrices for the simulation of
#' pseudo-bulk RNA-Seq samples with known cell composition using single-cell
#' expression profiles. The resulting \code{\linkS4class{ProbMatrixCellTypes}}
#' object contains a matrix that determines the proportion of the different cell
#' types that will compose the simulated pseudo-bulk samples. In addition, this
#' object also contains other information relevant for the process. This
#' function does not simulate pseudo-bulk samples, this task is performed by the
#' \code{\link{simBulkProfiles}} or \code{\link{trainDigitalDLSorterModel}}
#' functions (see Documentation).
#'
#' First, the available single-cell profiles are split into training and test
#' subsets (2/3 for training and 1/3 for test by default (see
#' \code{train.freq.cells})) to avoid falsifying the results during model
#' evaluation. Next, \code{num.bulk.samples} bulk samples proportions are built
#' and the single-cell profiles to be used to simulate each pseudo-bulk RNA-Seq
#' sample are set, being 100 cells per bulk sample by default (see
#' \code{n.cells} argument). The proportions of training and test pseudo-bulk
#' samples are set by \code{train.freq.bulk} (2/3 for training and 1/3 for
#' testing by default). Finally, in order to avoid biases due to the composition
#' of the pseudo-bulk RNA-Seq samples, cell type proportions (\eqn{w_1,...,w_k},
#' where \eqn{k} is the number of cell types available in single-cell profiles)
#' are randomly generated by using six different approaches:
#'
#' \enumerate{ \item Cell proportions are randomly sampled from a truncated
#' uniform distribution with predefined limits according to a priori knowledge
#' of the abundance of each cell type (see \code{prob.design} argument). This
#' information can be inferred from the single-cell experiment itself or from
#' the literature. \item A second set is generated by randomly permuting cell
#' type labels from a distribution generated by the previous method. \item Cell
#' proportions are randomly sampled as by method 1 without replacement. \item
#' Using the last method for generating proportions, cell types labels are
#' randomly sampled. \item Cell proportions are randomly sampled from a
#' Dirichlet distribution. \item Pseudo-bulk RNA-Seq samples composed of the
#' same cell type are generated in order to provide 'pure' pseudo-bulk samples.}
#'
#' If you want to inspect the distribution of cell type proportions generated by
#' each method during the process, they can be visualized by the
#' \code{\link{showProbPlot}} function (see Documentation).
#'
#' @param object \code{\linkS4class{DigitalDLSorter}} object with
#' \code{single.cell.real} slot and, optionally, with \code{single.cell.simul}
#' slot.
#' @param cell.ID.column Name or column number corresponding to the cell names
#' of expression matrix in cells metadata.
#' @param cell.type.column Name or column number corresponding to the cell type
#' of each cell in cells metadata.
#' @param prob.design Data frame with the expected frequency ranges for each
#' cell type present in the experiment. This information can be estimated from
#' literature or from the single-cell experiment itself. This data frame must
#' be constructed by three columns with specific headings (see examples):
#' \itemize{ \item A cell type column with the same name of the cell type
#' column in cells metadata (\code{cell.type.column}). If the name of the
#' column is not the same, the function will return an error. All cell types
#' must appear in the cells metadata. \item A second column called
#' \code{'from'} with the start frequency for each cell type. \item A third
#' column called \code{'to'} with the ending frequency for each cell type.}
#' @param num.bulk.samples Number of bulk RNA-Seq sample proportions (and thus
#' simulated bulk RNA-Seq samples) to be generated taking into account
#' training and test data. We recommend seting this value according to the
#' number of single-cell profiles available in
#' \code{\linkS4class{DigitalDLSorter}} object avoiding an excesive
#' re-sampling, but generating a large number of samples for better training.
#' @param n.cells Number of cells that will be aggregated in order to simulate
#' one bulk RNA-Seq sample (100 by default).
#' @param train.freq.cells Proportion of cells used to simulate training
#' pseudo-bulk samples (2/3 by default).
#' @param train.freq.bulk Proportion of bulk RNA-Seq samples to the total number
#' (\code{num.bulk.samples}) used for the training set (2/3 by default).
#' @param proportions.train Vector of six integers that determines the
#' proportions of bulk samples generated by the different methods (see Details
#' and Torroja and Sanchez-Cabo, 2019. for more information). This vector
#' represents proportions, so its entries must add up 100. By default, a
#' majority of random samples will be generated without using predefined
#' ranges.
#' @param proportions.test \code{proportions.train} for test samples.
#' @param prob.zero Probability of producing cell type proportions equal to
#' zero. It is a vector of six elements corresponding to the six methods of
#' producing cell type proportions (see \code{proportions.train} for more
#' details).
#' @param balanced.type.cells Boolean indicating whether the training and test
#' cells will be split in a balanced way considering the cell types
#' (\code{FALSE} by default).
#' @param verbose Show informative messages during the execution (\code{TRUE} by
#' default).
#'
#' @return A \code{\linkS4class{DigitalDLSorter}} object with
#' \code{prob.cell.types} slot containing a \code{list} with two
#' \code{\linkS4class{ProbMatrixCellTypes}} objects (training and test). For
#' more information about the structure of this class, see
#' \code{?\linkS4class{ProbMatrixCellTypes}}.
#'
#' @export
#'
#' @seealso \code{\link{simBulkProfiles}}
#' \code{\linkS4class{ProbMatrixCellTypes}}
#'
#' @examples
#' set.seed(123) # reproducibility
#' # simulated data
#' sce <- SingleCellExperiment::SingleCellExperiment(
#' assays = list(
#' counts = matrix(
#' rpois(30, lambda = 5), nrow = 15, ncol = 10,
#' dimnames = list(paste0("Gene", seq(15)), paste0("RHC", seq(10)))
#' )
#' ),
#' colData = data.frame(
#' Cell_ID = paste0("RHC", seq(10)),
#' Cell_Type = sample(x = paste0("CellType", seq(2)), size = 10,
#' replace = TRUE)
#' ),
#' rowData = data.frame(
#' Gene_ID = paste0("Gene", seq(15))
#' )
#' )
#' DDLS <- loadSCProfiles(
#' single.cell.data = sce,
#' cell.ID.column = "Cell_ID",
#' gene.ID.column = "Gene_ID"
#' )
#' probMatrixValid <- data.frame(
#' Cell_Type = paste0("CellType", seq(2)),
#' from = c(1, 30),
#' to = c(15, 70)
#' )
#' DDLS <- generateBulkCellMatrix(
#' object = DDLS,
#' cell.ID.column = "Cell_ID",
#' cell.type.column = "Cell_Type",
#' prob.design = probMatrixValid,
#' num.bulk.samples = 10,
#' verbose = TRUE
#' )
#'
#' @references Torroja, C. and Sánchez-Cabo, F. (2019). digitalDLSorter: A Deep
#' Learning algorithm to quantify immune cell populations based on scRNA-Seq
#' data. Frontiers in Genetics 10, 978. doi: \doi{10.3389/fgene.2019.00978}
#'
generateBulkCellMatrix <- function(
object,
cell.ID.column,
cell.type.column,
prob.design,
num.bulk.samples,
n.cells = 100,
train.freq.cells = 2/3,
train.freq.bulk = 2/3,
proportions.train = c(10, 5, 20, 15, 35, 15),
proportions.test = c(10, 5, 20, 15, 35, 15),
prob.zero = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5),
balanced.type.cells = FALSE,
verbose = TRUE
) {
if (!is(object, "DigitalDLSorter")) {
stop("The object provided is not of DigitalDLSorter class")
} else if (is.null(single.cell.real(object))) {
stop("'single.cell.real' slot is empty")
} else if (!train.freq.cells <= 0.95 || !train.freq.cells >= 0.05) {
stop("'train.seq.cells' argument must be less than or equal to 0.95 and ",
"greater than or equal to 0.05")
} else if (!train.freq.bulk <= 0.95 || !train.freq.bulk >= 0.05) {
stop("'train.seq.bulk' argument must be less than or equal to 0.95 and ",
"greater than or equal to 0.05")
} else if (missing(cell.ID.column) || missing(cell.type.column) ||
is.null(cell.ID.column) || is.null(cell.type.column)) {
stop("'cell.ID.column' and 'cell.type.column' arguments are needed. Please, ",
"look at ?estimateZinbwaveParams")
} else if (!is.data.frame(prob.design)) {
stop(paste(
"prob.design must be a data frame with three column names:",
paste("'cell.type.column': must be equal to 'cell.type.column' in",
"cells metadata (colData slot of SingleCellExperiment objects)"),
"'from': frequency from which the cell type can appear",
"'to': frequency up to which the cell type can appear",
sep = "\n - ")
)
} else if (any(proportions.train < 0) || any(proportions.test < 0)) {
stop("Proportions cannot be less than zero")
} else if (sum(proportions.train) != 100 || sum(proportions.test) != 100) {
stop("Proportions provided must add up to 100")
} else if (length(proportions.train) != 6 || length(proportions.test) != 6) {
stop("Proportions must be a vector of 6 elements")
} else if (length(prob.zero) != 6) {
stop("'prob.zero' must be a vector of 6 elements")
} else if (any(prob.zero > 1) || any(prob.zero < 0)) {
stop("'prob.zero' must be a vector whose elements must be between 0 and 1")
} else if (missing(num.bulk.samples) || is.null(num.bulk.samples)) {
stop("'num.bulk.samples' argument must be provided")
}
if (!is.null(prob.cell.types(object)) || !length(prob.cell.types(object)) == 0) {
warning("'prob.cell.types' slot already has probability matrices. ",
"Note that this object will be overwritten\n\n",
call. = FALSE, immediate. = TRUE)
}
if (!all(unlist(lapply(X = list(proportions.train, proportions.test),
FUN = function(x) all(x == floor(x)))))) {
stop("Proportions provided must be composed of integers")
}
if (!is.null(single.cell.simul(object))) {
# extract data from SCE to list
list.metadata <- list(
single.cell.real(object) %>%
SingleCellExperiment::colData(),
single.cell.simul(object) %>%
SingleCellExperiment::colData()
)
# check if cell.type.column and cell.ID.column are correct
lapply(
X = list(list.metadata[[1]], list.metadata[[2]]),
FUN = function(x) {
mapply(
function(y, z) {
.checkColumn(
metadata = x,
ID.column = y,
type.metadata = "cells.metadata",
arg = z
)
},
c(cell.ID.column, cell.type.column),
c("cell.ID.column", "cell.type.column")
)
}
)
list.metadata[[1]]$Simulated <- FALSE
list.metadata[[1]]$suffix <- ""
cells.metadata <- S4Vectors::rbind(list.metadata[[1]], list.metadata[[2]])
} else {
cells.metadata <- single.cell.real(object) %>%
SingleCellExperiment::colData()
}
# check if prob.design is correctly built
lapply(
X = c(cell.type.column, "from", "to"),
FUN = function(x) {
.checkColumn(
metadata = prob.design,
ID.column = x,
type.metadata = "prob.design",
arg = ""
)
}
)
if (any(duplicated(prob.design[, cell.type.column]))) {
stop(paste("'prob.design' must not contain duplicated cell types in",
cell.type.column, "column"))
} else if (!all(prob.design[, cell.type.column] %in%
unique(cells.metadata[, cell.type.column]))) {
stop("There are some cell types in 'prob.design' that do not appear in ",
"cells metadata. Check that the 'prob.design' matrix is correctly built")
} else if (any(prob.design$from < 1) || any(prob.design$from > 99)) {
stop("'from' column in 'prob.design' must be greater than or equal to 1 and ",
"less than or equal to 99")
} else if (any(prob.design$to <= 1) || any(prob.design$to > 100)) {
stop("'to' column in 'prob.design' must be greater than or equal to 1 and ",
"less than or equal to 100")
} else if (any(prob.design$from > prob.design$to)) {
stop("'from' entries must be less than 'to' entries")
} else if (any(abs(prob.design$from) + abs(prob.design$to) > 100)) {
stop("The sum between the 'from' and 'to' entries must not be greater than",
" 100")
}
# check if n.cells is invalid
if (n.cells <= 0) {
stop("'n.cells' must be greater than zero")
} else if (n.cells < length(unique(cells.metadata[, cell.type.column]))) {
stop("'n.cells' must be equal to or greater than the number of cell types in",
" experiment. We recommend more than 100 cells per bulk sample")
}
# check proportions --> avoid num.bulk.samples too low
total.train <- ceiling(num.bulk.samples * train.freq.bulk)
total.test <- num.bulk.samples - total.train
if (total.test == 0)
stop("'num.bulk.samples' too low in relation to 'train.freq.bulk'")
nums.train <- .setHundredLimit(
ceiling((total.train * proportions.train) / 100),
limit = total.train
)
nums.test <- .setHundredLimit(
ceiling((total.test * proportions.test) / 100),
limit = total.test
)
if (verbose) {
message(paste("\n=== The number of bulk RNA-Seq samples that will be generated",
"is equal to", num.bulk.samples))
}
# split data into training and test sets
cells <- cells.metadata[, cell.ID.column]
names(cells) <- cells.metadata[, cell.type.column]
# train set:
# balanced.type.cells == TRUE --> same number of cells by cell types
# balanced.type.cells == FALSE --> completely random
if (!balanced.type.cells) {
train.set <- sample(cells, size = round(nrow(cells.metadata) * train.freq.cells))
# check if we have all cell types in train and test data
test.set.tmp <- cells[!cells %in% train.set]
if (length(unique(names(train.set))) != length(unique(names(cells))) ||
length(unique(names(test.set.tmp))) != length(unique(names(cells)))) {
train.set <- lapply(
X = as.list(unique(names(cells))),
FUN = function(x, cells, train.freq.cells) {
sample(
cells[names(cells) == x],
size = round(length(cells[names(cells) == x]) * train.freq.cells)
)
}, cells = cells, train.freq.cells
) %>% unlist()
}
} else {
train.set <- lapply(
X = as.list(unique(names(cells))),
FUN = function(x, cells, train.freq.cells) {
sample(
cells[names(cells) == x],
size = round(length(cells[names(cells) == x]) * train.freq.cells)
)
}, cells = cells, train.freq.cells
) %>% unlist()
}
train.types <- names(train.set)
train.set.list <- list()
# sort cell types in order to speed up reading times in HDF5 files -->
# same order as are stored simulated cells in HDF5 file (unnecessary)
if (!is.null(zinb.params(object))) {
model.cell.types <- grep(pattern = cell.type.column,
x = colnames(zinb.params(object)@zinbwave.model@X),
value = T)
cell.type.names <- sub(pattern = cell.type.column,
replacement = "",
x = model.cell.types)
if (any(levels(factor(train.types)) %in% cell.type.names)) {
cell.type.names <- c(
cell.type.names,
setdiff(levels(factor(train.types)), cell.type.names)
)
}
} else {
cell.type.names <- levels(factor(train.types))
}
cell.type.train <- cell.type.names[cell.type.names %in% levels(factor(train.types))]
for (ts in cell.type.train) {
train.set.list[[ts]] <- train.set[train.types == ts]
}
# test set
test.set <- cells[!cells %in% train.set]
test.types <- names(test.set)
test.set.list <- list()
cell.type.test <- cell.type.names[cell.type.names %in% levels(factor(test.types))]
for (ts in cell.type.test) {
test.set.list[[ts]] <- test.set[test.types == ts]
}
# check if we have all the cell types in test data
if (length(unique(names(test.set))) != length(unique(names(cells)))) {
stop(
paste(
"Not all cell types considered in DigitalDLSorter object are in test",
"data. digitalDLSorteR needs to have all cell types in both subsets",
"(training and test). Please, provide a bigger single-cell experiment",
"or consider simulate new single-cell profiles with",
"'estimateZinbwaveParams' and 'simSCProfiles' functions"
)
)
}
# check if all cell types are present in train and test data
if (verbose) {
message("\n=== Training set cells by type:")
tb <- unlist(lapply(train.set.list, length))
message(paste0(" - ", names(tb), ": ", tb, collapse = "\n"), "\n")
message("=== Test set cells by type:")
tb <- unlist(lapply(test.set.list, length))
message(paste0(" - ", names(tb), ": ", tb, collapse = "\n"), "\n")
}
# take prob.design
prob.list <- lapply(
split(prob.design, seq(nrow(prob.design))),
FUN = function(x) {
return(seq(from = x[['from']], to = x[['to']]))
}
)
names(prob.list) <- prob.design[, cell.type.column]
n.cell.types <- length(unique(train.types))
functions.list <- list(
.generateSet1, .generateSet2, .generateSet3,
.generateSet4, .generateSet5, .generateSet6
)
# TRAIN SETS #################################################################
train.prob.matrix <- matrix(
NA_real_, nrow = sum(nums.train), ncol = n.cell.types
)
colnames(train.prob.matrix) <- names(prob.list)
train.plots <- list()
n <- 1
loc <- c(0, cumsum(nums.train))
for (fun in functions.list) {
if (nums.train[n] == 0) {
n <- n + 1
next
}
index.ex <- .generateExcludedTypes(
num = nums.train[n],
s.cells = total.train,
n.cell.types = n.cell.types,
prob.zero = prob.zero[n]
)
train.probs <- fun(
prob.list = prob.list,
num = nums.train[n],
s.cells = total.train,
n.cell.types = n.cell.types,
index.ex = index.ex
)
train.prob.matrix[seq(loc[n] + 1, loc[n + 1]), ] <- train.probs
train.plots[[n]] <- .plotsQCSets(
probs = train.probs,
prob.matrix = train.prob.matrix,
n = n,
set = "train"
)
n <- n + 1
}
# check if matrix is correctly built
if (is.null(dim(train.prob.matrix)))
train.prob.matrix <- matrix(train.prob.matrix, nrow = 1)
if (is.null(colnames(train.prob.matrix)))
colnames(train.prob.matrix) <- prob.design[, cell.type.column]
rownames(train.prob.matrix) <- paste("Bulk", seq(dim(train.prob.matrix)[1]),
sep = "_")
train.plots <- train.plots[!unlist(lapply(train.plots, is.null))]
train.method <- unlist(
mapply(
FUN = function(name.method, num.samp) rep(name.method, num.samp),
name.method = paste("Method", seq(1, length(nums.train))),
num.samp = nums.train
)
)
names(train.method) <- paste0("Bulk_", seq(sum(nums.train)))
if (verbose) {
message("=== Probability matrix for training data:")
message(paste(c(" - Bulk RNA-Seq samples:", " - Cell types:"),
dim(train.prob.matrix),
collapse = "\n"), "\n")
}
# TEST SETS ##################################################################
test.prob.matrix <-matrix(NA_real_, nrow = sum(nums.test), ncol = n.cell.types)
test.plots <- list()
n <- 1
# hardcoded because the last function cannot received prob.zero > 0
prob.zero[length(prob.zero)] <- 0
loc <- c(0, cumsum(nums.test))
for (fun in functions.list) {
if (nums.test[n] == 0) {
n <- n + 1
next
}
index.ex <- .generateExcludedTypes(
num = nums.test[n],
s.cells = total.test,
n.cell.types = n.cell.types,
prob.zero = prob.zero[n]
)
test.probs <- fun(
prob.list = prob.list,
num = nums.test[n],
s.cells = total.test,
n.cell.types = n.cell.types,
index.ex = index.ex
)
test.prob.matrix[seq(loc[n] + 1, loc[n + 1]), ] <- test.probs
test.plots[[n]] <- .plotsQCSets(
probs = test.probs,
prob.matrix = test.prob.matrix,
n = n,
set = "test"
)
n <- n + 1
}
# check if matrix is correctly built
if (is.null(dim(test.prob.matrix)))
test.prob.matrix <- matrix(test.prob.matrix, nrow = 1)
if (is.null(colnames(test.prob.matrix)))
colnames(test.prob.matrix) <- prob.design[, cell.type.column]
rownames(test.prob.matrix) <- paste("Bulk", seq(dim(test.prob.matrix)[1]),
sep = "_")
test.plots <- test.plots[!unlist(lapply(test.plots, is.null))]
test.method <- unlist(
mapply(
FUN = function(name.method, num.samp) rep(name.method, num.samp),
name.method = paste("Method", seq(1, length(nums.test))) ,
num.samp = nums.test
)
)
names(test.method) <- paste0("Bulk_", seq(sum(nums.test)))
if (verbose) {
message("=== Probability matrix for test data:")
message(paste(c(" - Bulk RNA-Seq samples:", " - Cell types:"),
dim(test.prob.matrix),
collapse = "\n"), "\n")
}
# GENERATE PROBS MATRIX NAMES
train.prob.matrix.names <- t(apply(
X = train.prob.matrix,
MARGIN = 1,
FUN = setCount,
setList = train.set.list,
sn = colnames(train.prob.matrix),
n.cells = n.cells
))
test.prob.matrix.names <- t(apply(
X = test.prob.matrix,
MARGIN = 1,
FUN = setCount,
setList = test.set.list,
sn = colnames(test.prob.matrix),
n.cells = n.cells
))
# generate object of ProbMatrixCellTypes class
train.prob.matrix.object <- new(
Class = "ProbMatrixCellTypes",
prob.matrix = train.prob.matrix,
cell.names = train.prob.matrix.names,
set.list = train.set.list,
set = train.set,
method = train.method,
plots = train.plots,
type.data = "train"
)
test.prob.matrix.object <- new(
Class = "ProbMatrixCellTypes",
prob.matrix = test.prob.matrix,
cell.names = test.prob.matrix.names,
set.list = test.set.list,
set = test.set,
method = test.method,
plots = test.plots,
type.data = "test"
)
prob.cell.types(object) <- list(
train = train.prob.matrix.object,
test = test.prob.matrix.object
)
if (verbose) message("DONE")
return(object)
}
.violinPlot <- function(
df,
title,
x = "CellType",
y = "Prob"
) {
plot <- ggplot(df, aes(x = .data[[x]], y = .data[[y]])) +
geom_violin() + ggtitle(title) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
return(plot)
}
.boxPlot <- function(
df,
title,
x = "CellType",
y = "Prob"
) {
plot <- ggplot(df, aes(x = .data[[x]], y = .data[[y]])) +
geom_boxplot() + ggtitle(title) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
return(plot)
}
.linesPlot <- function(
df,
title,
x = "CellType",
y = "Prob",
group = "Sample"
) {
plot <- ggplot(df, aes(x = .data[[x]], y = .data[[y]],
group = .data[[group]])) +
geom_line(colour = "grey60") + ggtitle(title) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
return(plot)
}
.plotsQCSets <- function(
probs,
prob.matrix,
n,
set
) {
title <- paste0("Bulk Probability Dist. Set ", n, " (", set, ")")
n.samples <- paste("# samples:", dim(probs)[1])
plots.functions <- list(.violinPlot, .boxPlot, .linesPlot)
df <- reshape2::melt(probs)
colnames(df) <- c("Sample", "CellType", "Prob")
# first three plots
plot.list <- lapply(plots.functions, function(f) f(df, paste(title, n.samples)))
# final plots
dummy <- t(apply(as.matrix(stats::na.omit(prob.matrix)), 1, sort, decreasing = T))
df <- reshape2::melt(dummy)
colnames(df) <- c("Sample", "nCellTypes", "Prob")
df$nCellTypes <- factor(df$nCellTypes)
plot.list[[4]] <- .boxPlot(df = df, x = "nCellTypes", title = title)
names(plot.list) <- c("violinplot", "boxplot", "linesplot", "ncelltypes")
return(plot.list)
}
# there is something wrong
setCount <- function(
x,
setList,
sn,
n.cells
) {
names(x) <- sn
sc <- c()
x.set <- .setHundredLimit(x = (x * n.cells) / 100, limit = n.cells)
for (cType in names(x)) {
n <- ceiling(x.set[cType]) # n <- ceiling(x[cType])
if (n > 0) {
if (is.null(setList[[cType]]))
next
repl <- ifelse(test = n > length(setList[[cType]]), yes = TRUE, no = FALSE)
sc <- c(sc, sample(setList[[cType]], size = n, replace = repl))
}
}
return(sc[seq(n.cells)])
}
# introduce zeros in selected proportions. when these samples will be
# simulated, they will have zero values in these cell types
.cellExcluder <- function(vec, index.ex) {
# sel <- sample(x = index.ex, size = length(index.ex)) # -1
vec[index.ex] <- 0
return(list(vec, index.ex))
}
# recursive implementation
# this function take 1 cell type from the available cell types (without
# using index.ex cell types) and add up or subtract depending what is needed
.setHundredLimit <- function(
x,
index.ex = NULL,
limit = 100
) {
if (sum(x) > limit) {
while (TRUE) {
if (is.null(index.ex)) {
sel <- sample(x = seq(length(x)), size = 1)
} else {
if (length(index.ex) != length(x) - 1) {
sel <- sample(x = which(x != 0), size = 1)
} else {
sel <- sample(x = seq(length(x))[-index.ex], size = 1)
}
}
res <- x[sel] - abs(sum(x) - limit)
if (res < 0) res <- x[sel] - sample(x = x[sel], size = 1)
if (res >= 0) break
}
x[sel] <- res
} else if (sum(x) < limit) {
while (TRUE) {
if (is.null(index.ex)) {
sel <- sample(x = seq(length(x)), size = 1)
} else {
sel <- sample(x = seq(length(x))[-index.ex], size = 1)
}
res <- x[sel] + abs(sum(x) - limit)
if (res <= limit) break
}
x[sel] <- res
}
if (sum(x) != limit)
return(.setHundredLimit(x = x, index.ex = index.ex, limit = limit))
else
return(x)
}
# This function is not intended to deal with index.ex, so I'm going to avoid
# it. Proportions will be corrected by .setHundredLimit. The idea is to
# distribute the proportions equally between the different cell types
.adjustHundred <- function(
x,
prob.list,
index.ex = NULL,
sampling = TRUE
) {
w <- unlist(lapply(prob.list, sample, 1))
# remove cell types if needed
if (!is.null(index.ex)) {
x.list <- .cellExcluder(vec = x, index.ex = index.ex)
x <- x.list[[1]]
# w[x.list[[2]]] <- 0
# # this sentence is in order to avoid that there is only one sample when they
# # should be two
# if (any(w[-x.list[[2]]] == 0)) {
# no.zero <- setdiff(seq_along(w), x.list)
# w[no.zero[w[no.zero] == 0]] <- 1
# }
}
d <- abs(sum(x) - 100)
if (sum(x) > 100) {
div.w <- (w / sum(w)) * d
while (!all(x >= div.w)) {
index <- which(!x >= div.w)
w[index] <- 0
div.w <- (w / sum(w)) * d
}
x <- round(x - div.w)
x <- .setHundredLimit(x = x, index.ex = index.ex)
} else if (sum(x) < 100) {
div.w <- (w / sum(w)) * d
while (!all(100 - div.w > x)) {
index <- which(!100 - div.w > x)
w[index] <- 0
div.w <- (w / sum(w)) * d
}
x <- round(x + div.w)
x <- .setHundredLimit(x = x, index.ex = index.ex)
}
return(x)
}
# generate index.ex variable for each sample using prob.zero
.generateExcludedTypes <- function(
num,
s.cells,
n.cell.types,
prob.zero
) {
if (prob.zero > 1 || prob.zero < 0)
stop("'prob.zero' must be an integer between 0 and 1")
prob.zero <- 1 - prob.zero
# number of cell types equal to zero have the same probability: only changes
# the probability of having 0 zeros. Maybe this could be improvable
random.zeros <- function(prob.zero) {
num.zero <- sample(
seq(0, n.cell.types - 1), size = 1,
prob = c(prob.zero, rep((1 - prob.zero) / (n.cell.types - 1), n.cell.types - 1))
)
# num.zero <- sample(x = seq(1, n.cell.types - 1), size = 1)
if (num.zero == 0) {
return(NULL)
} else {
res <- sample(seq(n.cell.types), size = num.zero)
if (length(res) == n.cell.types)
res <- res[-sample(x = n.cell.types, size = 1)]
}
return(res)
}
return(
replicate(
n = num,
expr = random.zeros(prob.zero = prob.zero)
)
)
}
# cell proportions are randomly sampled from a truncated uniform distribution
# with predefined limits according to the a priori knowledge of the abundance
# of each cell type
.generateSet1 <- function(
prob.list,
num,
s.cells,
n.cell.types,
index.ex
) {
sampling <- function(prob.list, ex.cells) {
return(
.cellExcluder(
vec = unlist(lapply(X = prob.list, FUN = sample, 1)),
index.ex = ex.cells
)[[1]]
)
}
# preallocate matrix in order to avoid memory problems
prob.matrix <- matrix(NA_real_, nrow = num, ncol = n.cell.types)
c <- 1
while (c <= num) {
prob.matrix[c,] <- sampling(prob.list = prob.list, ex.cells = index.ex[[c]])
c <- c + 1
}
prob.matrix <- round(prob.matrix * 100 / rowSums(prob.matrix))
prob.matrix <- do.call(
what = rbind,
args = lapply(
X = seq(nrow(prob.matrix)),
FUN = function(x) {
.setHundredLimit(x = prob.matrix[x, ], index.ex = index.ex[[x]])
}
)
)
colnames(prob.matrix) <- names(prob.list)
return(prob.matrix)
}
# generated by randomly permuting cell type labels on the previous proportions
# (.generateSet1)
.generateSet2 <- function(
prob.list,
num,
s.cells,
n.cell.types,
index.ex
) {
# First, I correct proportions: cell types with zero are not going to be
# changed, but in the next line I resample labels
prob.matrix <- .generateSet1(
prob.list = prob.list,
num = num,
s.cells = s.cells,
n.cell.types = n.cell.types,
index.ex = index.ex
)
prob.matrix <- t(apply(X = prob.matrix, MARGIN = 1, FUN = sample))
colnames(prob.matrix) <- names(prob.list)
return(prob.matrix)
}
.generateSet3 <- function(
prob.list,
num,
s.cells,
n.cell.types,
index.ex
) {
# preallocate matrix in order to avoid memory problems
prob.matrix <- matrix(NA_real_, nrow = num, ncol = n.cell.types)
c <- 1
while (c <= num) {
p <- rep(0, n.cell.types)
# To avoid a bias with first cell types, I use sample
i <- sample(
x = setdiff(x = seq(n.cell.types), y = index.ex[[c]]),
size = 1
)
while (sum(p) < 100) {
p[i] <- p[i] + sample(x = seq((100 - sum(p))), size = 1) # / n.cell.types
i <- sample(
x = setdiff(x = seq(n.cell.types), y = index.ex[[c]]), # c(index.ex[[c]], i)
size = 1
)
}
p <- round(p * 100 / sum(p))
p <- .adjustHundred(x = p, prob.list = prob.list, index.ex = index.ex[[c]])
prob.matrix[c, ] <- p
# counter
c <- c + 1
}
colnames(prob.matrix) <- names(prob.list)
return(prob.matrix)
}
.generateSet4 <- function(
prob.list,
num,
s.cells,
n.cell.types,
index.ex
) {
prob.matrix <- matrix(NA_real_, nrow = num, ncol = n.cell.types)
c <- 1
while(c <= num) {
p <- rep(0, n.cell.types)
names(p) <- names(prob.list)
i <- sample(
x = setdiff(x = seq(n.cell.types), y = index.ex[[c]]),
size = 1
)
while (sum(p) < 100) {
dp <- 101
while (dp > max(prob.list[[i]])) {
dp <- sample(x = prob.list[[i]], size = 1)
}
p[i] <- p[i] + dp
i <- sample(
x = setdiff(x = seq(n.cell.types), y = index.ex[[c]]),
size = 1
)
}
prob.matrix[c, ] <- p
c <- c + 1
}
prob.matrix <- round(prob.matrix * 100 / rowSums(prob.matrix))
prob.matrix <- do.call(
what = rbind,
args = lapply(
X = seq(nrow(prob.matrix)),
FUN = function(x) {
p <- .adjustHundred(
x = prob.matrix[x, ], prob.list = prob.list, index.ex = index.ex[[x]]
)
return(sample(p))
}
)
)
# p <- .adjustHundred(x = p, prob.list = prob.list, index.ex = index.ex[[c]])
# p <- sample(p)
colnames(prob.matrix) <- names(prob.list)
return(prob.matrix)
}
.generateSet5 <- function(
prob.list,
num,
s.cells,
n.cell.types,
index.ex
) {
prob.matrix <- do.call(
what = rbind,
args = lapply(
X = seq(num),
FUN = function(x) {
p <- gtools::rdirichlet(
n = 1,
alpha = .cellExcluder(
rep(1, n.cell.types), index.ex = index.ex[[x]]
)[[1]]
)
p <- round(p * 100 / sum(p))
return(.setHundredLimit(x = p, index.ex = index.ex[[x]]))
}
)
)
colnames(prob.matrix) <- names(prob.list)
return(prob.matrix)
}
# generate pseudo-bulk samples consisted of only 1 cell type
.generateSet6 <- function(
prob.list,
num,
s.cells,
n.cell.types,
index.ex
) {
prob.matrix <- matrix(0, nrow = num, ncol = n.cell.types)
num.set <- .setHundredLimit(
x = rep(ceiling(num / n.cell.types), n.cell.types), limit = num
)
num.index <- c(0, cumsum(num.set))
# which(num.set != 0) because may be cell types without pseudo-bulk --> error
for (i in which(num.set != 0)) {
prob.matrix[seq(num.index[i] + 1, num.index[i + 1]), i] <- 100
}
colnames(prob.matrix) <- names(prob.list)
return(prob.matrix)
}
################################################################################
######################## Simulate bulk RNA-Seq samples #########################
################################################################################
#' Simulate training and test pseudo-bulk RNA-Seq profiles
#'
#' Simulate training and test pseudo-bulk RNA-Seq profiles using the cell
#' composition matrices generated by the \code{\link{generateBulkCellMatrix}}
#' function. The samples are generated under the assumption that the expression
#' level of the \eqn{i} gene in the \eqn{j} bulk sample is given by the sum of
#' the expression levels of the cell types \eqn{X_{ijk}} that make them up
#' weighted by the proportions of these \eqn{k} cell types in each sample. In
#' practice, as described in Torroja and Sanchez-Cabo, 2019, these profiles are
#' generated by summing a number of cells of different cell types determined by
#' proportions from a matrix of known cell composition. The number of simulated
#' pseudo-bulk RNA-Seq samples and the number of cells composing each sample are
#' determined by \code{\link{generateBulkCellMatrix}} (see Documentation)
#' \strong{Note:} this step can be avoided by using the \code{on.the.fly}
#' argument in the \code{\link{trainDigitalDLSorterModel}} function. See
#' Documentation for more information.
#'
#' \pkg{digitalDLSorteR} allows the use of HDF5 files as back-end to store the
#' resulting data using the \pkg{DelayedArray} and \pkg{HDF5Array} packages.
#' This functionality allows to work without keeping the data loaded into RAM,
#' which could be of vital importance during some computationally heavy steps
#' such as neural network training on RAM-limited machines. You must provide a
#' valid file path in the \code{file.backend} argument to store the resulting
#' file with the '.h5' extension. The data will be accessible from R without
#' being loaded into memory. This option slightly slows down execution times, as
#' subsequent transformations of the data will be done in blocks rather than
#' using all the data. We recommend this option according to the computational
#' resources available and the number of pseudo-bulk samples to be generated.
#'
#' Note that if you use the \code{file.backend} argument with
#' \code{block.processing = FALSE}, all pseudo-bulk profiles will be simulated
#' in one step and, therefore, loaded into RAM. Then, the data will be written
#' to an HDF5 file. To avoid the RAM collapse, pseudo-bulk profiles can be
#' simulated and written to HDF5 files in blocks of \code{block.size} size by
#' setting \code{block.processing = TRUE}.
#'
#' It is possible to avoid this step by using the \code{on.the.fly} argument in
#' the \code{\link{trainDigitalDLSorterModel}} function. In this way, data is
#' generated 'on the fly' during the neural network training. For more details,
#' see \code{?\link{trainDigitalDLSorterModel}}.
#'
#' @param object \code{\linkS4class{DigitalDLSorter}} object with
#' \code{single.cell.real}/\code{single.cell.simul} and \code{prob.cell.types}
#' slots.
#' @param type.data Type of data to generate between \code{'train'},
#' \code{'test'} or \code{'both'} (the last by default).
#' @param pseudobulk.function Function used to build pseudo-bulk samples. It may
#' be: \itemize{ \item \code{"MeanCPM"}: single-cell profiles (raw counts) are
#' transformed into CPMs and cross-cell averages are calculated. Then,
#' \code{log2(CPM + 1)} is calculated. \item \code{"AddCPM"}: single-cell
#' profiles (raw counts) are transformed into CPMs and are added up across
#' cells. Then, log-CPMs are calculated. \item \code{"AddRawCount"}:
#' single-cell profiles (raw counts) are added up across cells. Then, log-CPMs
#' are calculated.}
#' @param file.backend Valid file path to store the simulated single-cell
#' expression profiles as an HDF5 file (\code{NULL} by default). If provided,
#' the data is stored in HDF5 files used as back-end by using the
#' \pkg{DelayedArray}, \pkg{HDF5Array} and \pkg{rhdf5} packages instead of
#' loading all data into RAM memory. This is suitable for situations where you
#' have large amounts of data that cannot be loaded into memory. Note that
#' operations on this data will be performed in blocks (i.e subsets of
#' determined size) which may result in longer execution times.
#' @param compression.level The compression level used if \code{file.backend} is
#' provided. It is an integer value between 0 (no compression) and 9 (highest
#' and slowest compression). See
#' \code{?\link[HDF5Array]{getHDF5DumpCompressionLevel}} from the
#' \pkg{HDF5Array} package for more information.
#' @param block.processing Boolean indicating whether the data should be
#' simulated in blocks (only if \code{file.backend} is used, \code{FALSE} by
#' default). This functionality is suitable for cases where is not possible to
#' load all data into memory and it leads to larger execution times.
#' @param block.size Only if \code{block.processing = TRUE}. Number of
#' pseudo-bulk expression profiles that will be simulated in each iteration
#' during the process. Larger numbers result in higher memory usage but
#' shorter execution times. Set according to available computational resources
#' (1000 by default).
#' @param chunk.dims Specifies the dimensions that HDF5 chunk will have. If
#' \code{NULL}, the default value is a vector of two items: the number of
#' genes considered by \code{\linkS4class{DigitalDLSorter}} object during the
#' simulation, and a single sample to reduce read times in the following
#' steps. A larger number of columns written in each chunk can lead to longer
#' read times.
#' @param threads Number of threads used during the simulation of pseudo-bulk
#' samples (1 by default). Set according to computational resources and avoid
#' it if \code{block.size} will be used.
#' @param verbose Show informative messages during the execution (\code{TRUE} by
#' default).
#'
#' @return A \code{\linkS4class{DigitalDLSorter}} object with \code{bulk.simul}
#' slot containing a list with one or two entries (depending on selected
#' \code{type.data} argument): \code{'train'} and \code{'test'}. Each entry
#' contains a \code{\link[SummarizedExperiment]{SummarizedExperiment}} object
#' with simulated bulk samples in the \code{assay} slot, sample names in the
#' \code{colData} slot and feature names in the \code{rowData} slot.
#'
#' @export
#'
#' @seealso \code{\link{generateBulkCellMatrix}}
#' \code{\linkS4class{ProbMatrixCellTypes}}
#' \code{\link{trainDigitalDLSorterModel}}
#'
#' @examples
#' set.seed(123) # reproducibility
#' # simulated data
#' sce <- SingleCellExperiment::SingleCellExperiment(
#' assays = list(
#' counts = matrix(
#' rpois(30, lambda = 5), nrow = 15, ncol = 10,
#' dimnames = list(paste0("Gene", seq(15)), paste0("RHC", seq(10)))
#' )
#' ),
#' colData = data.frame(
#' Cell_ID = paste0("RHC", seq(10)),
#' Cell_Type = sample(x = paste0("CellType", seq(2)), size = 10,
#' replace = TRUE)
#' ),
#' rowData = data.frame(
#' Gene_ID = paste0("Gene", seq(15))
#' )
#' )
#' DDLS <- loadSCProfiles(
#' single.cell.data = sce,
#' cell.ID.column = "Cell_ID",
#' gene.ID.column = "Gene_ID"
#' )
#' probMatrixValid <- data.frame(
#' Cell_Type = paste0("CellType", seq(2)),
#' from = c(1, 30),
#' to = c(15, 70)
#' )
#' DDLS <- generateBulkCellMatrix(
#' object = DDLS,
#' cell.ID.column = "Cell_ID",
#' cell.type.column = "Cell_Type",
#' prob.design = probMatrixValid,
#' num.bulk.samples = 10,
#' verbose = TRUE
#' )
#' DDLS <- simBulkProfiles(DDLS, verbose = TRUE)
#'
#' @references Fischer B, Smith M and Pau, G (2020). rhdf5: R Interface to HDF5.
#' R package version 2.34.0.
#'
#' Pagès H, Hickey P and Lun A (2020). DelayedArray: A unified framework for
#' working transparently with on-disk and in-memory array-like datasets. R
#' package version 0.16.0.
#'
#' Pagès H (2020). HDF5Array: HDF5 backend for DelayedArray objects. R package
#' version 1.18.0.
#'
simBulkProfiles <- function(
object,
type.data = "both",
pseudobulk.function = "MeanCPM",
file.backend = NULL,
compression.level = NULL,
block.processing = FALSE,
block.size = 1000,
chunk.dims = NULL,
threads = 1,
verbose = TRUE
) {
if (!is(object, "DigitalDLSorter")) {
stop("The object provided is not of DigitalDLSorter class")
} else if (is.null(single.cell.real(object))) {
stop("There are no real single-cell profiles in DigitalDLSorter object")
} else if (is.null(prob.cell.types(object))) {
stop("'prob.cell.types' slot is empty")
} else if (!any(type.data == c("train", "test", "both"))) {
stop("'type.data' argument must be one of the following options: 'train', 'test' or 'both'")
}
if (!pseudobulk.function %in% c("MeanCPM", "AddCPM", "AddRawCount")) {
stop("'pseudobulk.function' must be one of the following options: 'MeanCPM', 'AddCPM', 'AddRawCount'")
} else {
if (pseudobulk.function == "MeanCPM") {
.pseudobulk.fun <- pseudobulk.fun.mean.cpm
} else if (pseudobulk.function == "AddCPM") {
.pseudobulk.fun <- pseudobulk.fun.add.cpm
} else if (pseudobulk.function == "AddRawCount") {
.pseudobulk.fun <- pseudobulk.fun.add.raw.counts
}
}
if (!is.null(file.backend)) {
if (!requireNamespace("DelayedArray", quietly = TRUE) ||
!requireNamespace("HDF5Array", quietly = TRUE)) {
stop("digitalDLSorteR provides the possibility of using HDF5 files as back-end
when data are too big to be located in RAM. It uses DelayedArray,
HDF5Array and rhdf5 to do it. Please install both packages to
use this functionality")
}
if (file.exists(file.backend)) {
stop("'file.backend' already exists. Please provide a correct file path")
}
if (is.null(compression.level)) {
compression.level <- HDF5Array::getHDF5DumpCompressionLevel()
} else {
if (compression.level < 0 || compression.level > 9) {
stop("'compression.level' must be an integer between 0 (no compression) ",
"and 9 (highest and slowest compression). ")
}
}
}
if (threads <= 0) threads <- 1
if (verbose) {
message(paste("=== Setting parallel environment to", threads, "thread(s)"))
}
if (type.data == "both") {
if (!is.null(object@bulk.simul)) {
warning("'bulk.simul' slot will be overwritten\n\n",
call. = FALSE, immediate. = TRUE)
}
bulk.counts <- lapply(
X = c("train", "test"),
FUN = function(x) {
if (verbose) {
message(paste("\n=== Generating", x, "bulk samples:"))
}
.generateBulkProfiles(
object = object,
type.data = x,
fun.pseudobulk = .pseudobulk.fun,
unit = pseudobulk.function,
file.backend = file.backend,
compression.level = compression.level,
block.processing = block.processing,
block.size = block.size,
chunk.dims = chunk.dims,
threads = threads,
verbose = verbose
)
}
)
names(bulk.counts) <- c("train", "test")
object@bulk.simul <- bulk.counts
} else {
if (!is.null(bulk.simul(object)) && type.data %in% names(bulk.simul(object))) {
warning(paste(type.data, "data in 'bulk.simul' slot will be overwritten",
"\n\n"),
call. = FALSE, immediate. = TRUE)
}
if (verbose) {
message(paste("\n=== Generating", type.data, "bulk samples:"))
}
bulk.counts <- .generateBulkProfiles(
object = object,
type.data = type.data,
fun.pseudobulk = .pseudobulk.fun,
unit = pseudobulk.function,
file.backend = file.backend,
compression.level = compression.level,
block.processing = block.processing,
block.size = block.size,
chunk.dims = chunk.dims,
threads = threads,
verbose = verbose
)
if (!is.null(bulk.simul(object))) {
if (type.data %in% names(bulk.simul(object))) {
bulk.simul(object, type.data) <- NULL
}
bulk.simul(object) <- c(bulk.simul(object), type.data = bulk.counts)
} else {
bulk.simul(object) <- list(bulk.counts)
names(bulk.simul(object)) <- type.data
}
}
if (verbose) message("\nDONE")
return(object)
}
.generateBulkProfiles <- function(
object,
type.data,
fun.pseudobulk,
unit,
file.backend,
compression.level,
block.processing,
block.size,
chunk.dims,
threads,
verbose
) {
sel.bulk.cells <- prob.cell.types(object, type.data) %>% cell.names()
sel.bulk.cells <- sel.bulk.cells[sample(nrow(sel.bulk.cells)), ]
if (!is.null(single.cell.simul(object))) {
suffix.names <- unique(colData(single.cell.simul(object))$suffix)
} else {
suffix.names <- "_Simul"
}
pattern <- suffix.names
if (!verbose) pbo <- pbapply::pboptions(type = "none")
if (block.processing && is.null(file.backend)) {
stop("'block.processing' is only compatible to the use of HDF5 files ",
"as back-end ('file.backend' argument)")
} else if (block.processing && !is.null(file.backend)) {
n <- nrow(sel.bulk.cells)
J <- nrow(assay(single.cell.real(object)))
if (n < block.size) {
block.size <- n
warning("The number of simulated samples is less than 'block.size'. ",
"A single block will be performed.",
call. = FALSE, immediate. = TRUE)
}
if (is.null(chunk.dims) || length(chunk.dims) != 2) chunk.dims <- c(J, 1)
if (!file.exists(file.backend)) rhdf5::h5createFile(file.backend)
rhdf5::h5createDataset(
file.backend, type.data,
dims = c(J, block.size),
maxdims = c(J, n),
chunk = chunk.dims,
storage.mode = "double"
)
r.i <- 0
# iteration over cells
for (iter in seq(ceiling(n / block.size))) {
if (verbose) message(paste(" - Writing block", iter))
if ((block.size * iter) - n > 0) {
dif <- block.size
dif.2 <- (block.size * iter) - n
block.size <- block.size - dif.2
} else {
dif <- block.size
}
sub.i <- seq(from = r.i + 1, to = r.i + block.size)
r.i <- r.i + block.size
bulk.samples <- pbapply::pbapply(
X = sel.bulk.cells[sub.i, , drop = FALSE],
MARGIN = 1,
FUN = .setBulk,
object = object,
pattern = pattern,
fun.pseudobulk = fun.pseudobulk,
cl = threads
)
if (iter == 1) {
rhdf5::h5write(
obj = bulk.samples, file = file.backend,
name = type.data, level = compression.level
)
} else {
# check number of cells in the next loop
rhdf5::h5set_extent(file.backend, type.data, dims = c(J, n))
rhdf5::h5write(
obj = bulk.samples,
file = file.backend, name = type.data,
index = list(seq(J), seq((dif * (iter - 1)) + 1,
(dif * (iter - 1)) + ncol(bulk.samples))),
level = compression.level
)
}
}
rhdf5::H5close()
# HDF5Array object for SingleCellExperiment class
bulk.samples <- HDF5Array::HDF5Array(file = file.backend, name = type.data)
dimnames(bulk.samples) <- list(rownames(assay(single.cell.real(object))),
rownames(sel.bulk.cells))
} else if (!block.processing) {
bulk.samples <- pbapply::pbapply(
X = sel.bulk.cells,
MARGIN = 1,
FUN = .setBulk,
object = object,
pattern = pattern,
fun.pseudobulk = fun.pseudobulk,
cl = threads
)
}
if (!verbose) on.exit(pbapply::pboptions(pbo))
return(
.createSEObject(
counts = bulk.samples,
samples.metadata = prob.cell.types(object, type.data)@prob.matrix[rownames(sel.bulk.cells), ],
genes.metadata = rownames(assay(single.cell.real(object))),
file.backend = file.backend,
compression.level = compression.level,
block.processing = block.processing,
type.data = type.data,
verbose = verbose
)
)
}
.createSEObject <- function(
counts,
samples.metadata,
genes.metadata,
file.backend,
compression.level,
block.processing,
type.data,
verbose
) {
# could be a check of counts class -> if (is(counts, "HDF5Array"))
if (!is.null(file.backend) && !block.processing) {
counts <- .useH5backend(
counts = counts,
file.backend = file.backend,
compression.level = compression.level,
group = type.data,
sparse = FALSE,
verbose = verbose
)
}
return(
SummarizedExperiment::SummarizedExperiment(
assays = list(counts = counts),
colData = samples.metadata,
rowData = genes.metadata
)
)
}
## functions to calculate pseudo-bulk samples
.cpmCalculate <- function(x) {
if (is.null(dim(x))) return((x / sum(x)) * 1e6)
else return(apply(X = x, MARGIN = 2, FUN = function(x) (x / sum(x)) * 1e6))
}
pseudobulk.fun.mean.cpm <- function(x) {
rowMeans(log2(.cpmCalculate(x = x + 1)))
}
pseudobulk.fun.add.cpm <- function(x) {
.cpmCalculate(x = rowSums(log2(.cpmCalculate(x = x + 1))))
}
pseudobulk.fun.add.raw.counts <- function(x) {
log2(.cpmCalculate(x = rowSums(x) + 1))
}
.setBulk <- function(x, object, pattern, fun.pseudobulk) {
sep.b <- grepl(pattern = pattern, x = x)
if (any(sep.b)) {
cols.sim <- match(
x = x[sep.b], table = colnames(single.cell.simul(object))
) %>% sort()
cols.real <- match(
x = x[!sep.b], table = colnames(single.cell.real(object))
) %>% sort()
sim.counts <- as.matrix(assay(single.cell.simul(object))[, cols.sim, drop = FALSE])
real.counts <- as.matrix(assay(single.cell.real(object))[, cols.real, drop = FALSE])
counts <- .mergeMatrices(x = real.counts, y = sim.counts) # merge matrices
} else if (all(sep.b)) {
cols <- sort(match(x = x[sep.b], table = colnames(single.cell.simul(object))))
counts <- as.matrix(assay(single.cell.simul(object))[, cols, drop = FALSE])
} else {
cols <- match(x = x, table = colnames(single.cell.real(object))) %>% sort()
counts <- as.matrix(assay(single.cell.real(object))[, cols, drop = FALSE])
}
return(fun.pseudobulk(x = counts))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.