#' @importFrom ggplot2 ggplot aes geom_violin geom_boxplot geom_line theme ggtitle element_text
NULL
################################################################################
####################### Generate cell composition matrix #######################
################################################################################
#' Generate training and test cell type composition matrices
#'
#' Generate training and test cell type composition matrices for the simulation
#' of mixed transcriptional profiles with known cell composition using
#' single-cell expression profiles. The resulting
#' \code{\linkS4class{PropCellTypes}} object will contain all the information
#' needed to simulate new mixed transcriptional profiles. Note that this
#' function does not simulate the mixed profiles themselves; this task is
#' performed by the \code{\link{simMixedProfiles}} or \code{\link{trainDeconvModel}} functions
#' (see Documentation).
#'
#' First, the single-cell profiles are randomly divided into two subsets, with
#' 3/4 of the data used for training and 1/4 for testing by default. This ratio
#' can be changed using the \code{train.freq.cells} parameter. Next,
#' a total of \code{num.sim.spots} mixed proportions are simulated using a
#' Dirichlet distribution. This simulation takes into account the probability of
#' missing cell types in each spot, which can be adjusted using the
#' \code{prob.sparity} parameter. For each mixed sample, \code{n.cells}
#' single-cell profiles are randomly selected and combined to generate the
#' simulated mixed sample. In addition to the Dirichlet-based proportions, pure
#' spots (containing only one cell type) and spots containing a specified number
#' of different cell types (determined by the \code{min.zero.prop} parameter)
#' are also generated in order to cover situations with only a few cell types
#' present. The proportion of simulated spots generated by each method can be
#' controlled using the \code{proportion.method} parameter. To visualize the
#' distribution of cell type proportions generated by each method, the
#' \code{\link{showProbPlot}} function can be used.
#'
#' @param object \code{\linkS4class{SpatialDDLS}} object with
#' \code{single.cell.real} slot and, optionally, with \code{single.cell.simul}
#' slot.
#' @param cell.ID.column Name or column number corresponding to cell names in
#' cells metadata.
#' @param cell.type.column Name or column number corresponding to cell types in
#' cells metadata.
#' @param num.sim.spots Number of mixed profiles to be simulated. It is
#' recommended to adjust this number according to the number of available
#' single-cell profiles.
#' @param n.cells Specifies the number of cells to be randomly selected and
#' combined to generate each simulated mixed profile. By default, it is set to
#' 50. It also controls the level of noise in the simulated data, as it
#' determines how many single-cell profiles are combined to produce each
#' spot.
#' @param train.freq.cells Proportion of cells used to simulate training mixed
#' transcriptional profiles (3/4 by default).
#' @param train.freq.spots Proportion of mixed transcriptional profiles to be
#' used for training, relative to the total number of simulated spots
#' (\code{num.sim.spots}). The default value is 3/4.
#' @param proportion.method Vector with three elements that controls the
#' proportion of simulated spots generated by each method: random
#' sampling of a Dirichlet distribution, "pure" spots (1 cell type), and spots
#' generated from a random sampling of a Dirichlet distribution but with a
#' specified number of different cell types (determined by
#' \code{min.zero.prop}), respectively. By default, all samples are generated
#' by the last method.
#' @param prob.sparity This parameter only affects the proportions generated by
#' the first method (Dirichlet distribution). It determines the probability of
#' having missing cell types in each simulated spot, as opposed to a mixture of
#' all cell types. A higher value will result in sparser simulated samples.
#' @param min.zero.prop This parameter controls the minimum number of cell types
#' that will be absent in each simulated spot. If \code{NULL} (by default),
#' this value will be half of the total number of different cell types, but
#' increasing it will result in more spots composed of fewer cell types. This
#' helps to create more sparse proportions and cover a wider range of
#' situations during model training.
#' @param balanced.type.cells Boolean indicating whether training and test cells
#' will be split in a balanced way considering cell types (\code{TRUE} by
#' default).
#' @param verbose Show informative messages during the execution (\code{TRUE} by
#' default).
#'
#' @return A \code{\linkS4class{SpatialDDLS}} object with \code{prob.cell.types}
#' slot containing a \code{list} with two \code{\linkS4class{PropCellTypes}}
#' objects (training and test). For more information about the structure of
#' this class, see \code{?\linkS4class{PropCellTypes}}.
#'
#' @export
#'
#' @seealso \code{\link{simMixedProfiles}} \code{\linkS4class{PropCellTypes}}
#'
#' @examples
#' set.seed(123)
#' sce <- SingleCellExperiment::SingleCellExperiment(
#' assays = list(
#' counts = matrix(
#' rpois(100, lambda = 5), nrow = 40, ncol = 30,
#' dimnames = list(paste0("Gene", seq(40)), paste0("RHC", seq(30)))
#' )
#' ),
#' colData = data.frame(
#' Cell_ID = paste0("RHC", seq(30)),
#' Cell_Type = sample(x = paste0("CellType", seq(4)), size = 30,
#' replace = TRUE)
#' ),
#' rowData = data.frame(
#' Gene_ID = paste0("Gene", seq(40))
#' )
#' )
#'
#' SDDLS <- createSpatialDDLSobject(
#' sc.data = sce,
#' sc.cell.ID.column = "Cell_ID",
#' sc.gene.ID.column = "Gene_ID",
#' sc.filt.genes.cluster = FALSE,
#' project = "Simul_example"
#' )
#' SDDLS <- genMixedCellProp(
#' object = SDDLS,
#' cell.ID.column = "Cell_ID",
#' cell.type.column = "Cell_Type",
#' num.sim.spots = 10,
#' train.freq.cells = 2/3,
#' train.freq.spots = 2/3,
#' verbose = TRUE
#' )
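#'
#' ## Optional, illustrative check: the simulated proportions are stored as
#' ## PropCellTypes objects in the 'prob.cell.types' slot, and each row of the
#' ## probability matrix adds up to 100
#' head(prob.cell.types(SDDLS, "train")@prob.matrix)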
#'
genMixedCellProp <- function(
object,
cell.ID.column,
cell.type.column,
num.sim.spots,
n.cells = 50,
train.freq.cells = 3/4,
train.freq.spots = 3/4,
proportion.method = c(0, 0, 1),
prob.sparity = 1,
min.zero.prop = NULL,
balanced.type.cells = TRUE,
verbose = TRUE
) {
if (!is(object, "SpatialDDLS")) {
stop("The object provided is not of SpatialDDLS class")
} else if (is.null(single.cell.real(object))) {
stop("'single.cell.real' slot is empty")
} else if (train.freq.cells > 0.95 || train.freq.cells < 0.05) {
stop("'train.freq.cells' argument must be less than or equal to 0.95 and ",
"greater than or equal to 0.05")
} else if (train.freq.spots > 0.95 || train.freq.spots < 0.05) {
stop("'train.freq.spots' argument must be less than or equal to 0.95 and ",
"greater than or equal to 0.05")
} else if (missing(cell.ID.column) || missing(cell.type.column) ||
is.null(cell.ID.column) || is.null(cell.type.column)) {
stop("'cell.ID.column' and 'cell.type.column' arguments are needed. Please, ",
"look at ?estimateZinbwaveParams")
} else if (missing(num.sim.spots) || is.null(num.sim.spots)) {
stop("'num.sim.spots' argument must be provided")
}
if (length(proportion.method) != 3) {
stop("`proportion.method` must be a vector of three elements. See ?genMixedCellProp")
} else if (any(proportion.method < 0)) {
stop("Proportions in `proportion.method cannot be less than zero")
} else if (sum(proportion.method) != 1) {
stop("Proportions in `proportion.method` must add up to 1")
}
if (length(prob.sparity) != 1) {
stop("'prob.sparity' must be a single number between 0 and 1")
} else if (prob.sparity > 1 || prob.sparity < 0) {
stop("'prob.sparity' must be a number between 0 and 1")
}
if (!is.null(prob.cell.types(object)) && length(prob.cell.types(object)) > 0) {
warning("'prob.cell.types' slot already has probability matrices. ",
"Note that this object will be overwritten\n\n",
call. = FALSE, immediate. = TRUE)
}
if (!is.null(single.cell.simul(object))) {
# extract data from SCE to list
list.metadata <- list(
single.cell.real(object) %>%
SingleCellExperiment::colData(),
single.cell.simul(object) %>%
SingleCellExperiment::colData()
)
list.metadata[[1]]$Simulated <- FALSE
list.metadata[[1]]$suffix <- ""
cells.metadata <- S4Vectors::rbind(list.metadata[[1]], list.metadata[[2]])
} else {
cells.metadata <- single.cell.real(object) %>%
SingleCellExperiment::colData()
}
## checking columns
mapply(
function(y, z) {
.checkColumn(
metadata = cells.metadata,
ID.column = y,
type.metadata = "cells.metadata",
arg = z
)
},
c(cell.ID.column, cell.type.column),
c("cell.ID.column", "cell.type.column")
)
# check if n.cells is invalid
if (n.cells <= 0) {
stop("'n.cells' must be greater than zero")
}
# check proportions: avoid num.sim.spots too low
total.train <- ceiling(num.sim.spots * train.freq.spots)
total.test <- num.sim.spots - total.train
if (total.test == 0)
stop("'num.sim.spots' too low compared to 'train.freq.spots'")
nums.train <- .setHundredLimit(
ceiling(total.train * proportion.method),
limit = total.train
)
nums.test <- .setHundredLimit(
ceiling(total.test * proportion.method),
limit = total.test
)
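# .setHundredLimit() nudges the rounded per-method counts so that they add up
# exactly to the number of training/test spots (ceiling() may overshoot by a few)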
if (verbose) {
message(paste("\n=== The number of mixed profiles that will be generated",
"is equal to", num.sim.spots))
}
# split data into training and test sets
cells <- cells.metadata[, cell.ID.column]
names(cells) <- cells.metadata[, cell.type.column]
# train set:
# balanced.type.cells == TRUE: same number of cells by cell types
# balanced.type.cells == FALSE: completely random
if (!balanced.type.cells) {
train.set <- sample(cells, size = round(nrow(cells.metadata) * train.freq.cells))
# check if we have all cell types in train and test data
test.set.tmp <- cells[!cells %in% train.set]
if (length(unique(names(train.set))) != length(unique(names(cells))) ||
length(unique(names(test.set.tmp))) != length(unique(names(cells)))) {
train.set <- lapply(
X = as.list(unique(names(cells))),
FUN = function(x, cells, train.freq.cells) {
sample(
cells[names(cells) == x],
size = round(length(cells[names(cells) == x]) * train.freq.cells)
)
}, cells = cells, train.freq.cells
) %>% unlist()
}
} else {
train.set <- lapply(
X = as.list(unique(names(cells))),
FUN = function(x, cells, train.freq.cells) {
sample(
cells[names(cells) == x],
size = round(length(cells[names(cells) == x]) * train.freq.cells)
)
}, cells = cells, train.freq.cells
) %>% unlist()
}
train.types <- names(train.set)
train.set.list <- list()
# TODO: cell types are ordered alphabetically, not in the order they appear in the object
# sort cell types in order to speed up reading times in HDF5 files:
# same order as stored simulated cells in HDF5 file (unnecessary)
if (!is.null(zinb.params(object))) {
model.cell.types <- grep(
pattern = cell.type.column,
x = colnames(zinb.params(object)@zinbwave.model@X),
value = TRUE
)
cell.type.names <- sub(
pattern = cell.type.column, replacement = "", x = model.cell.types
)
if (any(levels(factor(train.types)) %in% cell.type.names)) {
cell.type.names <- c(
cell.type.names,
setdiff(levels(factor(train.types)), cell.type.names)
)
}
} else {
cell.type.names <- levels(factor(train.types))
}
cell.type.train <- cell.type.names[cell.type.names %in% levels(factor(train.types))]
for (ts in cell.type.train) {
train.set.list[[ts]] <- train.set[train.types == ts]
}
# test set
test.set <- cells[!cells %in% train.set]
test.types <- names(test.set)
test.set.list <- list()
cell.type.test <- cell.type.names[cell.type.names %in% levels(factor(test.types))]
for (ts in cell.type.test) {
test.set.list[[ts]] <- test.set[test.types == ts]
}
# check if we have all the cell types in test data
if (length(unique(names(test.set))) != length(unique(names(cells)))) {
stop(
paste(
"Not all cell types considered by SpatialDDLS are in test",
"data. SpatialDDLS needs to have all cell types in both subsets",
"(training and test). Please, provide a bigger single-cell experiment",
"or consider simulate new single-cell profiles with",
"'estimateZinbwaveParams' and 'simSCProfiles' functions"
)
)
}
# check if all cell types are present in train and test data
if (verbose) {
message("\n=== Training set cells by type:")
tb <- unlist(lapply(train.set.list, length))
message(paste0(" - ", names(tb), ": ", tb, collapse = "\n"), "\n")
message("=== Test set cells by type:")
tb <- unlist(lapply(test.set.list, length))
message(paste0(" - ", names(tb), ": ", tb, collapse = "\n"), "\n")
}
n.cell.types <- length(unique(train.types))
functions.list <- list(.generateSetDir, .generateSetPure, .generateSetDir)
if (is.null(min.zero.prop)) {
min.zero.prop <- ceiling(n.cell.types / 2)
} else {
if (min.zero.prop > n.cell.types - 2 || min.zero.prop < 2) {
warning(
"'min.zero.prop' must be between 2 and the number of cell types - 2. ",
"Setting min.zero.prop = n.cell.types - 2",
call. = FALSE, immediate. = TRUE
)
min.zero.prop <- n.cell.types - 2
}
}
# TRAIN SETS #################################################################
train.prob.matrix <- matrix(
NA_real_, nrow = sum(nums.train), ncol = n.cell.types
)
colnames(train.prob.matrix) <- cell.type.train
train.plots <- list()
n <- 1
loc <- c(0, cumsum(nums.train))
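# 'loc' holds cumulative row offsets: rows (loc[n] + 1):loc[n + 1] of the
# probability matrix correspond to spots generated by method n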
for (fun in seq_along(functions.list)) {
if (nums.train[n] == 0) {
n <- n + 1
next
}
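# exclusion indices per generation method: method 1 draws random patterns of
# absent cell types according to 'prob.sparity', method 2 (pure spots) needs
# none, and method 3 forces at least 'min.zero.prop' absent cell types per spot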
index.ex <- switch(
as.character(fun),
"1" = .generateExcludedTypes(
num = nums.train[n],
s.cells = total.train,
n.cell.types = n.cell.types,
prob.sparity = prob.sparity
),
"2" = NULL,
"3" = .generateExcludedTypesScaled(
num = nums.train[n],
s.cells = total.train,
n.cell.types = n.cell.types,
min.zero.prop = min.zero.prop
)
)
train.probs <- functions.list[[fun]](
cell.types = cell.type.train,
num = nums.train[n],
s.cells = total.train,
n.cell.types = n.cell.types,
index.ex = index.ex
)
train.prob.matrix[seq(loc[n] + 1, loc[n + 1]), ] <- train.probs
train.plots[[n]] <- .plotsQCSets(
probs = train.probs,
prob.matrix = train.prob.matrix,
n = n,
set = "train"
)
n <- n + 1
}
# check if matrix is correctly built
if (is.null(dim(train.prob.matrix)))
train.prob.matrix <- matrix(train.prob.matrix, nrow = 1)
# if (is.null(colnames(train.prob.matrix)))
# colnames(train.prob.matrix) <- prob.design[, cell.type.column]
rownames(train.prob.matrix) <- paste("Spot", "train", seq(dim(train.prob.matrix)[1]),
sep = "_")
train.plots <- train.plots[!unlist(lapply(train.plots, is.null))]
train.method <- unlist(
mapply(
FUN = function(name.method, num.samp) rep(name.method, num.samp),
name.method = paste("Method", seq(1, length(nums.train))),
num.samp = nums.train
)
)
names(train.method) <- paste0("Spot_train_", seq(sum(nums.train)))
if (verbose) {
message("=== Probability matrix for training data:")
message(paste(c(" - Mixed spots:", " - Cell types:"),
dim(train.prob.matrix),
collapse = "\n"), "\n")
}
# TEST SETS ##################################################################
test.prob.matrix <- matrix(NA_real_, nrow = sum(nums.test), ncol = n.cell.types)
test.plots <- list()
n <- 1
# hardcoded because the last function cannot receive prob.sparity > 0
# prob.sparity[length(prob.sparity)] <- 0
loc <- c(0, cumsum(nums.test))
for (fun in seq_along(functions.list)) {
if (nums.test[n] == 0) {
n <- n + 1
next
}
index.ex <- switch(
as.character(fun),
"1" = .generateExcludedTypes(
num = nums.test[n],
s.cells = total.test,
n.cell.types = n.cell.types,
prob.sparity = prob.sparity
),
"2" = NULL,
"3" = .generateExcludedTypesScaled(
num = nums.test[n],
s.cells = total.test,
n.cell.types = n.cell.types,
min.zero.prop = min.zero.prop
)
)
test.probs <- functions.list[[fun]](
cell.types = cell.type.test,
num = nums.test[n],
s.cells = total.test,
n.cell.types = n.cell.types,
index.ex = index.ex
)
test.prob.matrix[seq(loc[n] + 1, loc[n + 1]), ] <- test.probs
test.plots[[n]] <- .plotsQCSets(
probs = test.probs,
prob.matrix = test.prob.matrix,
n = n,
set = "test"
)
n <- n + 1
}
# check if matrix is correctly built
if (is.null(dim(test.prob.matrix)))
test.prob.matrix <- matrix(test.prob.matrix, nrow = 1)
if (is.null(colnames(test.prob.matrix)))
colnames(test.prob.matrix) <- cell.type.test
rownames(test.prob.matrix) <- paste("Spot", "test", seq(dim(test.prob.matrix)[1]),
sep = "_")
test.plots <- test.plots[!unlist(lapply(test.plots, is.null))]
test.method <- unlist(
mapply(
FUN = function(name.method, num.samp) rep(name.method, num.samp),
name.method = paste("Method", seq(1, length(nums.test))) ,
num.samp = nums.test
)
)
names(test.method) <- paste0("Spot_test_", seq(sum(nums.test)))
if (verbose) {
message("=== Probability matrix for test data:")
message(
paste(
c(" - Mixed spots:", " - Cell types:"),
dim(test.prob.matrix), collapse = "\n"
),
"\n"
)
}
# GENERATE PROB MATRIX NAMES
train.prob.matrix.names <- t(apply(
X = train.prob.matrix,
MARGIN = 1,
FUN = setCount,
setList = train.set.list,
sn = colnames(train.prob.matrix),
n.cells = n.cells
))
test.prob.matrix.names <- t(apply(
X = test.prob.matrix,
MARGIN = 1,
FUN = setCount,
setList = test.set.list,
sn = colnames(test.prob.matrix),
n.cells = n.cells
))
# generate object
train.prob.matrix.object <- new(
Class = "PropCellTypes",
prob.matrix = train.prob.matrix,
cell.names = train.prob.matrix.names,
set.list = train.set.list,
set = train.set,
method = train.method,
plots = train.plots,
type.data = "train"
)
test.prob.matrix.object <- new(
Class = "PropCellTypes",
prob.matrix = test.prob.matrix,
cell.names = test.prob.matrix.names,
set.list = test.set.list,
set = test.set,
method = test.method,
plots = test.plots,
type.data = "test"
)
prob.cell.types(object) <- list(
train = train.prob.matrix.object,
test = test.prob.matrix.object
)
if (verbose) message("DONE")
return(object)
}
.violinPlot <- function(
df,
title,
x = "CellType",
y = "Prob"
) {
plot <- ggplot(df, aes(x = .data[[x]], y = .data[[y]])) +
geom_violin() + ggtitle(title) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
return(plot)
}
.boxPlot <- function(
df,
title,
x = "CellType",
y = "Prob"
) {
plot <- ggplot(df, aes(x = .data[[x]], y = .data[[y]])) +
geom_boxplot() + ggtitle(title) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
return(plot)
}
.linesPlot <- function(
df,
title,
x = "CellType",
y = "Prob",
group = "Sample"
) {
plot <- ggplot(df, aes(x = .data[[x]], y = .data[[y]],
group = .data[[group]])) +
geom_line(colour = "grey60") + ggtitle(title) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
return(plot)
}
.plotsQCSets <- function(
probs,
prob.matrix,
n,
set
) {
title <- paste0("Bulk Probability Dist. Set ", n, " (", set, ")")
n.samples <- paste("# samples:", dim(probs)[1])
plots.functions <- list(.violinPlot, .boxPlot, .linesPlot)
df <- reshape2::melt(probs)
colnames(df) <- c("Sample", "CellType", "Prob")
# first three plots
plot.list <- lapply(plots.functions, function(f) f(df, paste(title, n.samples)))
# final plots
dummy <- t(apply(as.matrix(stats::na.omit(prob.matrix)), 1, sort, decreasing = TRUE))
df <- reshape2::melt(dummy)
colnames(df) <- c("Sample", "nCellTypes", "Prob")
df$nCellTypes <- factor(df$nCellTypes)
plot.list[[4]] <- .boxPlot(df = df, x = "nCellTypes", title = title)
names(plot.list) <- c("violinplot", "boxplot", "linesplot", "ncelltypes")
return(plot.list)
}
# TODO: is there something wrong??
setCount <- function(
x,
setList,
sn,
n.cells
) {
names(x) <- sn
sc <- c()
x.set <- .setHundredLimit(x = (x * n.cells) / 100, limit = n.cells)
for (cType in names(x)) {
n <- ceiling(x.set[cType]) # n <- ceiling(x[cType])
if (n > 0) {
if (is.null(setList[[cType]]))
next
repl <- ifelse(test = n > length(setList[[cType]]), yes = TRUE, no = FALSE)
sc <- c(sc, sample(setList[[cType]], size = n, replace = repl))
}
}
return(sc[seq(n.cells)])
}
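## Illustrative sketch (not run; toy names are hypothetical): setCount() turns
## one row of percentages into the IDs of the 'n.cells' cells forming a spot:
##   props  <- c(A = 50, B = 30, C = 20)
##   toySet <- list(A = paste0("a", 1:5), B = paste0("b", 1:5), C = paste0("c", 1:5))
##   setCount(props, setList = toySet, sn = names(props), n.cells = 10)
##   # ~5 cells of type A, ~3 of B and ~2 of C; sampling is done with
##   # replacement when a cell type has fewer cells than requested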
# introduce zeros in selected proportions
.cellExcluder <- function(vec, index.ex) {
# sel <- sample(x = index.ex, size = length(index.ex)) # -1
vec[index.ex] <- 0
return(list(vec, index.ex))
}
# recursive implementation
# this function takes 1 cell type from the available ones (excluding the
# index.ex cell types) and adds or subtracts the amount needed to reach the limit
.setHundredLimit <- function(
x,
index.ex = NULL,
limit = 100
) {
if (sum(x) > limit) {
while (TRUE) {
if (is.null(index.ex)) {
sel <- sample(x = seq(length(x)), size = 1)
} else {
if (length(index.ex) != length(x) - 1) {
sel <- sample(x = which(x != 0), size = 1)
} else {
sel <- sample(x = seq(length(x))[-index.ex], size = 1)
}
}
res <- x[sel] - abs(sum(x) - limit)
if (res < 0) res <- x[sel] - sample(x = x[sel], size = 1)
if (res >= 0) break
}
x[sel] <- res
} else if (sum(x) < limit) {
while (TRUE) {
if (is.null(index.ex)) {
sel <- sample(x = seq(length(x)), size = 1)
} else {
sel <- sample(x = seq(length(x))[-index.ex], size = 1)
}
res <- x[sel] + abs(sum(x) - limit)
if (res <= limit) break
}
x[sel] <- res
}
if (sum(x) != limit)
return(.setHundredLimit(x = x, index.ex = index.ex, limit = limit))
else
return(x)
}
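## Illustrative sketch (not run): .setHundredLimit() adjusts randomly chosen
## entries until the vector sums exactly to 'limit', e.g.
##   .setHundredLimit(c(34, 33, 32), limit = 100)  # sums to 99; one entry is bumped up
##   .setHundredLimit(c(40, 40, 30), limit = 100)  # sums to 110; one entry is reduced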
.generateExcludedTypes <- function(
num,
s.cells,
n.cell.types,
prob.sparity
) {
# if (prob.sparity > 1 || prob.sparity < 0)
# stop("'prob.sparity' must be an integer between 0 and 1")
prob.sparity <- 1 - prob.sparity
# every non-zero number of absent cell types has the same probability; only
# the probability of having no absent cell types changes. This could be improved
random.zeros <- function(prob.sparity) {
num.zero <- sample(
seq(0, n.cell.types - 1), size = 1,
prob = c(
prob.sparity, rep((1 - prob.sparity) / (n.cell.types - 1), n.cell.types - 1)
)
)
# num.zero <- sample(x = seq(1, n.cell.types - 1), size = 1)
if (num.zero == 0) {
return(NULL)
} else {
res <- sample(seq(n.cell.types), size = num.zero)
if (length(res) == n.cell.types)
res <- res[-sample(x = n.cell.types, size = 1)]
}
return(res)
}
return(
replicate(
n = num,
expr = random.zeros(prob.sparity = prob.sparity)
)
)
}
## same as above, but the number of excluded cell types is set manually instead of being drawn with 'prob.sparity'
.generateExcludedTypesScaled <- function(
num,
s.cells,
n.cell.types,
min.zero.prop = ceiling(n.cell.types / 2)
) {
n.samples.each <- rep(
ceiling(num / (n.cell.types - min.zero.prop)), n.cell.types - min.zero.prop
)
n.samples.each <- .setHundredLimit(n.samples.each, limit = num)
return(
sapply(
rep(seq(min.zero.prop, n.cell.types - 1), n.samples.each),
FUN = function(num.excl) sample(seq(n.cell.types), size = num.excl)
)
)
}
# generate spots based on a Dirichlet distribution
.generateSetDir <- function(
cell.types,
num,
s.cells,
n.cell.types,
index.ex
) {
prob.matrix <- do.call(
what = rbind,
args = lapply(
X = seq(num),
FUN = function(x) {
p <- gtools::rdirichlet(
n = 1,
alpha = .cellExcluder(
rep(1, n.cell.types), index.ex = index.ex[[x]]
)[[1]]
)
p <- round(p * 100 / sum(p))
return(.setHundredLimit(x = p, index.ex = index.ex[[x]]))
}
)
)
colnames(prob.matrix) <- cell.types
return(prob.matrix)
}
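## Illustrative sketch (not run): one spot over 4 cell types with the second
## one excluded would be generated roughly as
##   alpha <- .cellExcluder(rep(1, 4), index.ex = 2)[[1]]    # c(1, 0, 1, 1)
##   p <- gtools::rdirichlet(n = 1, alpha = alpha)
##   .setHundredLimit(round(p * 100 / sum(p)), index.ex = 2) # percentages summing to 100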
# generate spots consisting of only 1 cell type
.generateSetPure <- function(
cell.types,
num,
s.cells,
n.cell.types,
index.ex
) {
prob.matrix <- matrix(0, nrow = num, ncol = n.cell.types)
num.set <- .setHundredLimit(
x = rep(ceiling(num / n.cell.types), n.cell.types), limit = num
)
num.index <- c(0, cumsum(num.set))
# which(num.set != 0): some cell types may be assigned no pure spots, which would cause an error
for (i in which(num.set != 0)) {
prob.matrix[seq(num.index[i] + 1, num.index[i + 1]), i] <- 100
}
colnames(prob.matrix) <- cell.types
return(prob.matrix)
}
################################################################################
########################## Simulate mixed profiles #############################
################################################################################
#' Simulate training and test mixed spot profiles
#'
#' Simulate training and test mixed spot transcriptional profiles using cell
#' composition matrices generated by the \code{\link{genMixedCellProp}}
#' function.
#'
#' Mixed profiles are generated under the assumption that the expression level
#' of a particular gene in a given spot is the sum of the expression levels of
#' the cell types that make it up weighted by their proportions. In practice, as
#' described in Torroja and Sanchez-Cabo, 2019, these profiles are generated by
#' summing the gene expression levels of a given number of cells, as specified
#' by a known cell composition matrix. The number of simulated spots and the number
#' of cells used to simulate each spot are determined by the \code{\link{genMixedCellProp}}
#' function. This step can be avoided by using the \code{on.the.fly} argument in
#' the \code{\link{trainDeconvModel}} function.
#'
#' \pkg{SpatialDDLS} allows the use of HDF5 files as back-end to store simulated
#' data by means of the \pkg{DelayedArray} and \pkg{HDF5Array} packages. This
#' functionality makes it possible to work without keeping the data loaded into
#' RAM, which may be useful during computationally heavy steps such as neural
#' network training on RAM-limited machines. You must provide a valid file path
#' in the \code{file.backend} argument to store the resulting file with the
#' '.h5' extension. This option slows down execution times, as subsequent
#' transformations of the data will be done in blocks. Note that if you use the
#' \code{file.backend} argument with \code{block.processing = FALSE}, all mixed
#' profiles will be simulated in one step and, thus, loaded into RAM; the
#' matrix will then be written to an HDF5 file. To avoid exhausting RAM, these
#' profiles can instead be simulated and written to the HDF5 file in blocks of
#' \code{block.size} size by setting \code{block.processing = TRUE}. We
#' recommend choosing this option according to the available computational
#' resources and the number of spots to be simulated, although in most cases it
#' is not necessary.
#'
#' @param object \code{\linkS4class{SpatialDDLS}} object with
#' \code{single.cell.real}/\code{single.cell.simul}, and
#' \code{prob.cell.types} slots.
#' @param type.data Type of data to generate: \code{'train'}, \code{'test'} or
#' \code{'both'} (the last by default).
#' @param mixing.function Function used to build mixed transcriptional profiles.
#' It may be: \itemize{ \item \code{"AddRawCount"}: single-cell profiles (raw
#' counts) are added up across cells. Then, log-CPMs are calculated (by
#' default). \item \code{"MeanCPM"}: single-cell profiles (raw counts) are
#' transformed into CPMs and cross-cell averages are calculated. Then,
#' \code{log2(CPM + 1)} is calculated. \item \code{"AddCPM"}: single-cell
#' profiles (raw counts) are transformed into CPMs and are added up across
#' cells. Then, log-CPMs are calculated.}
#' @param file.backend Valid file path to store simulated mixed expression
#' profiles as an HDF5 file (\code{NULL} by default). If provided, data are
#' stored in HDF5 files used as back-end by using the \pkg{DelayedArray},
#' \pkg{HDF5Array} and \pkg{rhdf5} packages instead of loading all data into
#' RAM. Note that operations on this matrix will be performed in blocks (i.e.,
#' subsets of a given size), which may result in longer execution times.
#' @param compression.level The compression level used if \code{file.backend} is
#' provided. It is an integer value between 0 (no compression) and 9 (highest
#' and slowest compression). See
#' \code{?\link[HDF5Array]{getHDF5DumpCompressionLevel}} from the
#' \pkg{HDF5Array} package for more information.
#' @param block.processing Boolean indicating whether data should be simulated
#' in blocks (only if \code{file.backend} is used, \code{FALSE} by default).
#' This functionality is suitable for cases where it is not possible to load
#' all data into memory, and it leads to longer execution times.
#' @param block.size Only if \code{block.processing = TRUE}. Number of mixed
#' expression profiles that will be simulated in each iteration. Larger
#' numbers result in higher memory usage but shorter execution times. Set it
#' according to the available computational resources (1000 by default).
#' @param chunk.dims Specifies the dimensions of the chunks in the HDF5 file. If
#' \code{NULL}, the default value is a vector of two items: the number of
#' genes considered by the \code{\linkS4class{SpatialDDLS}} object during the
#' simulation, and a single sample, in order to reduce read times in the
#' following steps. A larger number of columns written per chunk can lead to
#' longer read times.
#' @param threads Number of threads used during simulation (1 by default).
#' @param verbose Show informative messages during the execution (\code{TRUE} by
#' default).
#'
#' @return A \code{\linkS4class{SpatialDDLS}} object with \code{mixed.profiles}
#' slot containing a list with one or two entries (depending on selected
#' \code{type.data} argument): \code{'train'} and \code{'test'}. Each entry
#' consists of a
#' \code{\link[SummarizedExperiment]{SummarizedExperiment}}
#' object with the simulated mixed spot profiles.
#'
#' @export
#'
#' @seealso \code{\link{genMixedCellProp}} \code{\linkS4class{PropCellTypes}}
#' \code{\link{trainDeconvModel}}
#'
#' @examples
#' set.seed(123)
#' sce <- SingleCellExperiment::SingleCellExperiment(
#' assays = list(
#' counts = matrix(
#' rpois(100, lambda = 5), nrow = 40, ncol = 30,
#' dimnames = list(paste0("Gene", seq(40)), paste0("RHC", seq(30)))
#' )
#' ),
#' colData = data.frame(
#' Cell_ID = paste0("RHC", seq(30)),
#' Cell_Type = sample(x = paste0("CellType", seq(4)), size = 30,
#' replace = TRUE)
#' ),
#' rowData = data.frame(
#' Gene_ID = paste0("Gene", seq(40))
#' )
#' )
#'
#' SDDLS <- createSpatialDDLSobject(
#' sc.data = sce,
#' sc.cell.ID.column = "Cell_ID",
#' sc.gene.ID.column = "Gene_ID",
#' sc.filt.genes.cluster = FALSE,
#' project = "Simul_example"
#' )
#' SDDLS <- genMixedCellProp(
#' object = SDDLS,
#' cell.ID.column = "Cell_ID",
#' cell.type.column = "Cell_Type",
#' num.sim.spots = 10,
#' train.freq.cells = 2/3,
#' train.freq.spots = 2/3,
#' verbose = TRUE
#' )
#' SDDLS <- simMixedProfiles(SDDLS, verbose = TRUE)
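#'
#' ## Optional, illustrative check: the simulated profiles are stored as
#' ## SummarizedExperiment objects in the 'mixed.profiles' slot
#' mixed.profiles(SDDLS)[["train"]]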
#'
#' @references Fischer B, Smith M and Pau, G (2020). rhdf5: R Interface to HDF5.
#' R package version 2.34.0.
#'
#' Pagès H, Hickey P and Lun A (2020). DelayedArray: A unified framework for
#' working transparently with on-disk and in-memory array-like datasets. R
#' package version 0.16.0.
#'
#' Pagès H (2020). HDF5Array: HDF5 backend for DelayedArray objects. R package
#' version 1.18.0.
#'
simMixedProfiles <- function(
object,
type.data = "both",
mixing.function = "AddRawCount",
file.backend = NULL,
compression.level = NULL,
block.processing = FALSE,
block.size = 1000,
chunk.dims = NULL,
threads = 1,
verbose = TRUE
) {
if (!is(object, "SpatialDDLS")) {
stop("The object provided is not of SpatialDDLS class")
} else if (is.null(single.cell.real(object))) {
stop("There are no real single-cell profiles in SpatialDDLS object")
} else if (is.null(prob.cell.types(object))) {
stop("'prob.cell.types' slot is empty")
} else if (!any(type.data == c("train", "test", "both"))) {
stop("'type.data' argument must be one of the following options: 'train', 'test' or 'both'")
}
if (!mixing.function %in% c("MeanCPM", "AddCPM", "AddRawCount", "AddRawCountLogCPM")) {
stop("'mixing.function' must be one of the following options: 'MeanCPM', 'AddCPM', 'AddRawCountLogCPM', 'AddRawCount'")
} else {
if (mixing.function == "MeanCPM") {
.agg.fun <- aggregation.fun.mean.cpm
} else if (mixing.function == "AddCPM") {
.agg.fun <- aggregation.fun.add.cpm
} else if (mixing.function == "AddRawCountLogCPM") { ## not updated in docs, it shouldn't be used...
.agg.fun <- aggregation.fun.add.raw.counts.cpm
} else if (mixing.function == "AddRawCount") {
.agg.fun <- aggregation.fun.add.raw.counts
}
}
if (!is.null(file.backend)) {
if (!requireNamespace("DelayedArray", quietly = TRUE) ||
!requireNamespace("HDF5Array", quietly = TRUE)) {
stop("SpatialDDLS provides the possibility of using HDF5 files as back-end
when data are too big to be located in RAM. It uses DelayedArray,
HDF5Array and rhdf5 to do it. Please install both packages to
use this functionality")
}
if (file.exists(file.backend)) {
stop("'file.backend' already exists. Please provide a correct file path")
}
if (is.null(compression.level)) {
compression.level <- HDF5Array::getHDF5DumpCompressionLevel()
} else {
if (compression.level < 0 || compression.level > 9) {
stop("'compression.level' must be an integer between 0 (no compression) ",
"and 9 (highest and slowest compression). ")
}
}
}
if (threads <= 0) threads <- 1
if (verbose) {
message(paste("=== Setting parallel environment to", threads, "thread(s)"))
}
if (type.data == "both") {
if (!is.null(object@mixed.profiles)) {
warning("'mixed.profiles' slot will be overwritten\n\n",
call. = FALSE, immediate. = TRUE)
}
bulk.counts <- lapply(
X = c("train", "test"),
FUN = function(x) {
if (verbose) {
message(paste("\n=== Generating", x, "mixed profiles:"))
}
.generateBulkProfiles(
object = object,
type.data = x,
fun.pseudobulk = .agg.fun,
unit = mixing.function,
file.backend = file.backend,
compression.level = compression.level,
block.processing = block.processing,
block.size = block.size,
chunk.dims = chunk.dims,
threads = threads,
verbose = verbose
)
}
)
names(bulk.counts) <- c("train", "test")
object@mixed.profiles <- bulk.counts
} else {
if (!is.null(mixed.profiles(object)) &&
type.data %in% names(mixed.profiles(object))) {
warning(
paste(
type.data, "data in 'mixed.profiles' slot will be overwritten",
"\n\n"
),
call. = FALSE, immediate. = TRUE
)
}
if (verbose) {
message(paste("\n=== Generating", type.data, "mixed profiles:"))
}
bulk.counts <- .generateBulkProfiles(
object = object,
type.data = type.data,
fun.pseudobulk = .agg.fun,
unit = mixing.function,
file.backend = file.backend,
compression.level = compression.level,
block.processing = block.processing,
block.size = block.size,
chunk.dims = chunk.dims,
threads = threads,
verbose = verbose
)
if (!is.null(mixed.profiles(object))) {
if (type.data %in% names(mixed.profiles(object))) {
mixed.profiles(object, type.data) <- NULL
}
mixed.profiles(object) <- c(
mixed.profiles(object), stats::setNames(list(bulk.counts), type.data)
)
} else {
mixed.profiles(object) <- list(bulk.counts)
names(mixed.profiles(object)) <- type.data
}
}
if (verbose) message("\nDONE")
return(object)
}
.generateBulkProfiles <- function(
object,
type.data,
fun.pseudobulk,
unit,
file.backend,
compression.level,
block.processing,
block.size,
chunk.dims,
threads,
verbose
) {
sel.bulk.cells <- prob.cell.types(object, type.data) %>% cell.names()
sel.bulk.cells <- sel.bulk.cells[sample(nrow(sel.bulk.cells)), ]
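# randomize spot order; otherwise spots generated by the different methods
# would appear in contiguous blocks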
if (!is.null(single.cell.simul(object))) {
suffix.names <- unique(colData(single.cell.simul(object))$suffix)
} else {
suffix.names <- "_Simul"
}
pattern <- suffix.names
if (!verbose) pbo <- pbapply::pboptions(type = "none")
if (block.processing && is.null(file.backend)) {
stop("'block.processing' is only compatible to the use of HDF5 files ",
"as back-end ('file.backend' argument)")
} else if (block.processing && !is.null(file.backend)) {
n <- nrow(sel.bulk.cells)
J <- nrow(assay(single.cell.real(object)))
if (n < block.size) {
block.size <- n
warning("The number of simulated samples is less than 'block.size'. ",
"A single block will be performed.",
call. = FALSE, immediate. = TRUE)
}
if (is.null(chunk.dims) || length(chunk.dims) != 2) chunk.dims <- c(J, 1)
if (!file.exists(file.backend)) rhdf5::h5createFile(file.backend)
rhdf5::h5createDataset(
file.backend, type.data,
dims = c(J, block.size),
maxdims = c(J, n),
chunk = chunk.dims,
storage.mode = "double"
)
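# the HDF5 dataset is created with 'block.size' columns and extended up to 'n'
# columns (see h5set_extent() below) as successive blocks are written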
r.i <- 0
# iteration over cells
for (iter in seq(ceiling(n / block.size))) {
if (verbose) message(paste(" - Writing block", iter))
if ((block.size * iter) - n > 0) {
dif <- block.size
dif.2 <- (block.size * iter) - n
block.size <- block.size - dif.2
} else {
dif <- block.size
}
sub.i <- seq(from = r.i + 1, to = r.i + block.size)
r.i <- r.i + block.size
bulk.samples <- pbapply::pbapply(
X = sel.bulk.cells[sub.i, , drop = FALSE],
MARGIN = 1,
FUN = .setBulk,
object = object,
pattern = pattern,
fun.pseudobulk = fun.pseudobulk,
cl = threads
)
if (iter == 1) {
rhdf5::h5write(
obj = bulk.samples, file = file.backend,
name = type.data, level = compression.level
)
} else {
# check number of cells in the next loop
rhdf5::h5set_extent(file.backend, type.data, dims = c(J, n))
rhdf5::h5write(
obj = bulk.samples,
file = file.backend, name = type.data,
index = list(seq(J), seq((dif * (iter - 1)) + 1,
(dif * (iter - 1)) + ncol(bulk.samples))),
level = compression.level
)
}
}
rhdf5::H5close()
# HDF5Array object for SingleCellExperiment class
bulk.samples <- HDF5Array::HDF5Array(file = file.backend, name = type.data)
dimnames(bulk.samples) <- list(rownames(assay(single.cell.real(object))),
rownames(sel.bulk.cells))
} else if (!block.processing) {
bulk.samples <- pbapply::pbapply(
X = sel.bulk.cells,
MARGIN = 1,
FUN = .setBulk,
object = object,
pattern = pattern,
fun.pseudobulk = fun.pseudobulk,
cl = threads
)
}
if (!verbose) on.exit(pbapply::pboptions(pbo))
return(
.createSEObject(
counts = bulk.samples,
samples.metadata = prob.cell.types(
object, type.data
)@prob.matrix[rownames(sel.bulk.cells), ],
genes.metadata = rownames(assay(single.cell.real(object))),
mixing.fun = unit,
file.backend = file.backend,
compression.level = compression.level,
block.processing = block.processing,
type.data = type.data,
verbose = verbose
)
)
}
.createSEObject <- function(
counts,
samples.metadata,
genes.metadata,
mixing.fun,
file.backend,
compression.level,
block.processing,
type.data,
verbose
) {
# could be a check of counts class -> if (is(counts, "HDF5Array"))
if (!is.null(file.backend) && !block.processing) {
counts <- .useH5backend(
counts = counts,
file.backend = file.backend,
compression.level = compression.level,
group = type.data,
sparse = FALSE,
verbose = verbose
)
}
return(
SummarizedExperiment::SummarizedExperiment(
assays = list(counts = counts),
colData = samples.metadata,
rowData = genes.metadata,
metadata = list(mixing.fun = mixing.fun)
)
)
}
.cpmCalculate <- function(x, fact = 10000) {
if (is.null(dim(x))) return((x / sum(x)) * fact)
else return(apply(X = x, MARGIN = 2, FUN = function(x) (x / sum(x)) * fact))
}
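# aggregation functions selected through the 'mixing.function' argument of
# simMixedProfiles(): each takes a genes x cells count matrix and returns a
# single mixed profile. Note that .cpmCalculate() normalizes to a library size
# of 'fact' (10,000 by default)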
aggregation.fun.mean.cpm <- function(x) {
rowMeans(log2(.cpmCalculate(x = x + 1)))
}
aggregation.fun.add.cpm <- function(x) {
.cpmCalculate(x = rowSums(log2(.cpmCalculate(x = x + 1))))
}
aggregation.fun.add.raw.counts.cpm <- function(x) {
log2(.cpmCalculate(x = rowSums(x) + 1))
}
aggregation.fun.add.raw.counts <- function(x) {
rowSums(x)
}
.setBulk <- function(x, object, pattern, fun.pseudobulk) {
sep.b <- grepl(pattern = pattern, x = x)
if (any(sep.b) && !all(sep.b)) {
cols.sim <- match(
x = x[sep.b], table = colnames(single.cell.simul(object))
) %>% sort()
cols.real <- match(
x = x[!sep.b], table = colnames(single.cell.real(object))
) %>% sort()
sim.counts <- as.matrix(
assay(single.cell.simul(object))[, cols.sim, drop = FALSE]
)
real.counts <- as.matrix(
assay(single.cell.real(object))[, cols.real, drop = FALSE]
)
counts <- .mergeMatrices(x = real.counts, y = sim.counts) # merge matrices
} else if (all(sep.b)) {
cols <- sort(match(x = x[sep.b], table = colnames(single.cell.simul(object))))
counts <- as.matrix(assay(single.cell.simul(object))[, cols, drop = FALSE])
} else {
cols <- match(x = x, table = colnames(single.cell.real(object))) %>% sort()
counts <- as.matrix(assay(single.cell.real(object))[, cols, drop = FALSE])
}
return(fun.pseudobulk(x = counts))
}