R/simBulk.R

Defines functions .setBulk pseudobulk.fun.add.raw.counts pseudobulk.fun.add.raw.counts.cpm pseudobulk.fun.add.cpm pseudobulk.fun.mean.cpm .cpmCalculate .createSEObject .generateBulkProfiles simBulkProfiles .generateSet6 .generateSet5 .generateSet4 .generateSet3 .generateSet2 .generateSet1 .generateExcludedTypes .adjustHundred .setHundredLimit .cellExcluder setCount .plotsQCSets .linesPlot .boxPlot .violinPlot generateBulkCellMatrix

Documented in generateBulkCellMatrix simBulkProfiles

#' @importFrom ggplot2 ggplot aes geom_violin geom_boxplot geom_line theme ggtitle element_text
NULL


################################################################################
####################### Generate cell composition matrix #######################
################################################################################

#' Generate training and test cell composition matrices
#'
#' Generate training and test cell composition matrices for the simulation of
#' pseudo-bulk RNA-Seq samples with known cell composition using single-cell
#' expression profiles. The resulting \code{\linkS4class{ProbMatrixCellTypes}}
#' object contains a matrix that determines the proportion of the different cell
#' types that will compose the simulated pseudo-bulk samples. In addition, this
#' object also contains other information relevant for the process. This
#' function does not simulate pseudo-bulk samples, this task is performed by the
#' \code{\link{simBulkProfiles}} or \code{\link{trainDDLSModel}}
#' functions (see Documentation).
#'
#' First, the available single-cell profiles are split into training and test
#' subsets (2/3 for training and 1/3 for test by default (see
#' \code{train.freq.cells})) to avoid falsifying the results during model
#' evaluation. Next, \code{num.bulk.samples} bulk samples proportions are built
#' and the single-cell profiles to be used to simulate each pseudo-bulk RNA-Seq
#' sample are set, being 100 cells per bulk sample by default (see
#' \code{n.cells} argument). The proportions of training and test pseudo-bulk
#' samples are set by \code{train.freq.bulk} (2/3 for training and 1/3 for
#' testing by default). Finally, in order to avoid biases due to the composition
#' of the pseudo-bulk RNA-Seq samples, cell type proportions (\eqn{w_1,...,w_k},
#' where \eqn{k} is the number of cell types available in single-cell profiles)
#' are randomly generated by using six different approaches:
#'
#' \enumerate{ \item Cell proportions are randomly sampled from a truncated
#' uniform distribution with predefined limits according to a priori knowledge
#' of the abundance of each cell type (see \code{prob.design} argument). This
#' information can be inferred from the single-cell experiment itself or from
#' the literature. \item A second set is generated by randomly permuting cell
#' type labels from a distribution generated by the previous method. \item Cell
#' proportions are randomly sampled as by method 1 without replacement. \item
#' Using the last method for generating proportions, cell types labels are
#' randomly sampled. \item Cell proportions are randomly sampled from a
#' Dirichlet distribution. \item Pseudo-bulk RNA-Seq samples composed of the
#' same cell type are generated in order to provide 'pure' pseudo-bulk samples.}
#'
#' If you want to inspect the distribution of cell type proportions generated by
#' each method during the process, they can be visualized by the
#' \code{\link{showProbPlot}} function (see Documentation).
#'
#' @param object \code{\linkS4class{DigitalDLSorter}} object with
#'   \code{single.cell.real} slot and, optionally, with \code{single.cell.simul}
#'   slot.
#' @param cell.ID.column Name or column number corresponding to the cell names
#'   of expression matrix in cells metadata.
#' @param cell.type.column Name or column number corresponding to the cell type
#'   of each cell in cells metadata.
#' @param prob.design Data frame with the expected frequency ranges for each
#'   cell type present in the experiment. This information can be estimated from
#'   literature or from the single-cell experiment itself. This data frame must
#'   be constructed by three columns with specific headings (see examples):
#'   \itemize{ \item A cell type column with the same name of the cell type
#'   column in cells metadata (\code{cell.type.column}). If the name of the
#'   column is not the same, the function will return an error. All cell types
#'   must appear in the cells metadata. \item A second column called
#'   \code{'from'} with the start frequency for each cell type. \item A third
#'   column called \code{'to'} with the ending frequency for each cell type.}
#' @param num.bulk.samples Number of bulk RNA-Seq sample proportions (and thus
#'   simulated bulk RNA-Seq samples) to be generated taking into account
#'   training and test data. We recommend seting this value according to the
#'   number of single-cell profiles available in
#'   \code{\linkS4class{DigitalDLSorter}} object avoiding an excesive
#'   re-sampling, but generating a large number of samples for better training.
#' @param n.cells Number of cells that will be aggregated in order to simulate
#'   one bulk RNA-Seq sample (100 by default).
#' @param train.freq.cells Proportion of cells used to simulate training
#'   pseudo-bulk samples (2/3 by default).
#' @param train.freq.bulk Proportion of bulk RNA-Seq samples to the total number
#'   (\code{num.bulk.samples}) used for the training set (2/3 by default).
#' @param proportion.method Vector of six integers that determines the
#'   proportions of bulk samples generated by the different methods (see Details
#'   and Torroja and Sanchez-Cabo, 2019. for more information). This vector
#'   represents proportions, so its entries must add up 100. By default, a
#'   majority of random samples will be generated without using predefined
#'   ranges.
#' @param prob.sparsity It only affects the proportions generated by the first
#'   method (Dirichlet distribution). It determines the probability of having
#'   missing cell types in each simulated spot, as opposed to a mixture of all
#'   cell types. A higher value for this parameter will result in more sparse
#'   simulated samples.
#' @param min.zero.prop This parameter controls the minimum number of cell types
#'   that will be absent in each simulated spot. If \code{NULL} (by default),
#'   this value will be half of the total number of different cell types, but
#'   increasing it will result in more spots composed of fewer cell types. This
#'   helps to create more sparse proportions and cover a wider range of
#'   situations during model training.
#' @param balanced.type.cells Boolean indicating whether the training and test
#'   cells will be split in a balanced way considering the cell types
#'   (\code{FALSE} by default).
#' @param verbose Show informative messages during the execution (\code{TRUE} by
#'   default).
#'
#' @return A \code{\linkS4class{DigitalDLSorter}} object with
#'   \code{prob.cell.types} slot containing a \code{list} with two
#'   \code{\linkS4class{ProbMatrixCellTypes}} objects (training and test). For
#'   more information about the structure of this class, see
#'   \code{?\linkS4class{ProbMatrixCellTypes}}.
#'
#' @export
#'
#' @seealso \code{\link{simBulkProfiles}}
#'   \code{\linkS4class{ProbMatrixCellTypes}}
#'
#' @examples
#' set.seed(123) # reproducibility
#' # simulated data
#' sce <- SingleCellExperiment::SingleCellExperiment(
#'   assays = list(
#'     counts = matrix(
#'       rpois(30, lambda = 5), nrow = 15, ncol = 10, 
#'       dimnames = list(paste0("Gene", seq(15)), paste0("RHC", seq(10)))
#'     )
#'   ),
#'   colData = data.frame(
#'     Cell_ID = paste0("RHC", seq(10)),
#'     Cell_Type = sample(x = paste0("CellType", seq(2)), size = 10, 
#'                        replace = TRUE)
#'   ),
#'   rowData = data.frame(
#'     Gene_ID = paste0("Gene", seq(15))
#'   )
#' )
#' DDLS <- createDDLSobject(
#'   sc.data = sce,
#'   sc.cell.ID.column = "Cell_ID",
#'   sc.gene.ID.column = "Gene_ID",
#'   sc.filt.genes.cluster = FALSE, 
#'   sc.log.FC = FALSE
#' )
#' probMatrixValid <- data.frame(
#'   Cell_Type = paste0("CellType", seq(2)),
#'   from = c(1, 30),
#'   to = c(15, 70)
#' )
#' DDLS <- generateBulkCellMatrix(
#'   object = DDLS,
#'   cell.ID.column = "Cell_ID",
#'   cell.type.column = "Cell_Type",
#'   prob.design = probMatrixValid,
#'   num.bulk.samples = 10,
#'   verbose = TRUE
#' )
#' 
#' @references Torroja, C. and Sánchez-Cabo, F. (2019). digitalDLSorter: A Deep
#'   Learning algorithm to quantify immune cell populations based on scRNA-Seq
#'   data. Frontiers in Genetics 10, 978. doi: \doi{10.3389/fgene.2019.00978}
#'   
generateBulkCellMatrix <- function(
  object,
  cell.ID.column,
  cell.type.column,
  prob.design,
  num.bulk.samples,
  n.cells = 100,
  train.freq.cells = 3/4,
  train.freq.bulk = 3/4,
  proportion.method = c(10, 5, 20, 15, 35, 15),
  prob.sparsity = 0.5,
  min.zero.prop = NULL,
  balanced.type.cells = FALSE,
  verbose = TRUE
) {
  if (!is(object, "DigitalDLSorter")) {
    stop("The object provided is not of DigitalDLSorter class")
  } else if (is.null(single.cell.real(object))) {
    stop("'single.cell.real' slot is empty")
  } else if (!train.freq.cells <= 0.95 || !train.freq.cells >= 0.05) {
    stop("'train.seq.cells' argument must be less than or equal to 0.95 and ", 
         "greater than or equal to 0.05")
  } else if (!train.freq.bulk <= 0.95 || !train.freq.bulk >= 0.05) {
    stop("'train.seq.bulk' argument must be less than or equal to 0.95 and ", 
         "greater than or equal to 0.05")
  } else if (missing(cell.ID.column) || missing(cell.type.column) || 
             is.null(cell.ID.column) || is.null(cell.type.column)) {
    stop("'cell.ID.column' and 'cell.type.column' arguments are needed. Please, ", 
         "look at ?estimateZinbwaveParams") 
  } else if (!is.data.frame(prob.design)) {
    stop(paste(
      "prob.design must be a data frame with three column names:",
      paste("'cell.type.column': must be equal to 'cell.type.column' in",
            "cells metadata (colData slot of SingleCellExperiment objects)"),
      "'from': frequency from which the cell type might appear",
      "'to': frequency up to which the cell type migth appear", 
      sep = "\n   - ")
    )
  } else if (any(proportion.method < 0)) {
    stop("Proportions cannot be less than zero")
  } else if (sum(proportion.method) != 100) {
    stop("Proportions provided must add up to 100")
  } else if (length(proportion.method) != 6) {
    stop("Proportions must be a vector of 6 elements")
  } else if (prob.sparsity > 1 || prob.sparsity < 0) {
    stop("'prob.sparsity' must be a vector whose elements must be between 0 and 1")
  } else if (missing(num.bulk.samples) || is.null(num.bulk.samples)) {
    stop("'num.bulk.samples' argument must be provided")
  } 
  if (!is.null(prob.cell.types(object)) || !length(prob.cell.types(object)) == 0) {
    warning("'prob.cell.types' slot already has probability matrices. ", 
            "Note that this object will be overwritten\n\n",
            call. = FALSE, immediate. = TRUE)
  }
  if (!all(proportion.method == floor(proportion.method))) 
    stop("Proportions provided must be composed of integers")
  
  if (!is.null(single.cell.simul(object))) {
    # extract data from SCE to list
    list.metadata <- list(
      single.cell.real(object) %>% 
        SingleCellExperiment::colData(),
      single.cell.simul(object) %>% 
        SingleCellExperiment::colData()
    )
    # check if cell.type.column and cell.ID.column are correct
    lapply(
      X = list(list.metadata[[1]], list.metadata[[2]]),
      FUN = function(x) {
        mapply(
          function(y, z) {
            .checkColumn(
              metadata = x,
              ID.column = y,
              type.metadata = "cells.metadata",
              arg = z
            )
          },
          c(cell.ID.column, cell.type.column),
          c("cell.ID.column", "cell.type.column")
        )
      }
    )
    list.metadata[[1]]$Simulated <- FALSE
    list.metadata[[1]]$suffix <- ""
    cells.metadata <- S4Vectors::rbind(list.metadata[[1]], list.metadata[[2]])  
  } else {
    cells.metadata <- single.cell.real(object) %>% 
      SingleCellExperiment::colData()
  }
  # check if prob.design is correctly built
  lapply(
    X = c(cell.type.column, "from", "to"),
    FUN = function(x) {
      .checkColumn(
        metadata = prob.design,
        ID.column = x,
        type.metadata = "prob.design",
        arg = ""
      )
    }
  )
  prob.design <- prob.design %>% as.data.frame()
  if (any(duplicated(prob.design[[cell.type.column]]))) {
    stop(paste("'prob.design' must not contain duplicated cell types in",
               cell.type.column, "column"))
  } else if (!all(prob.design[[cell.type.column]] %in%
                  unique(cells.metadata[, cell.type.column]))) {
    stop("There are some cell types in 'prob.design' that do not appear in ", 
         "cells metadata. Check that the 'prob.design' matrix is correctly built")
  } else if (any(prob.design$from < 1) || any(prob.design$from > 99)) {
    stop("'from' column in 'prob.design' must be greater than or equal to 1 and ",
         "less than or equal to 99")
  } else if (any(prob.design$to <= 1) || any(prob.design$to > 100)) {
    stop("'to' column in 'prob.design' must be greater than or equal to 1 and ", 
         "less than or equal to 100")
  } else if (any(prob.design$from > prob.design$to)) {
    stop("'from' entries must be less than 'to' entries")
  } 
  # check if n.cells is invalid
  if (n.cells <= 0) {
    stop("'n.cells' must be greater than zero")
  } else if (n.cells < length(unique(cells.metadata[, cell.type.column]))) {
    stop("'n.cells' must be equal to or greater than the number of cell types in",
         " experiment. We recommend more than 100 cells per bulk sample")
  }
  # check proportions --> avoid num.bulk.samples too low
  total.train <- ceiling(num.bulk.samples * train.freq.bulk)
  total.test <- num.bulk.samples - total.train
  if (total.test == 0) 
    stop("'num.bulk.samples' too low in relation to 'train.freq.bulk'")
  nums.train <- .setHundredLimit(
    ceiling((total.train * proportion.method) / 100),
    limit = total.train
  )
  nums.test <- .setHundredLimit(
    ceiling((total.test * proportion.method) / 100),
    limit = total.test
  )
  if (verbose) {
    message(paste("\n=== The number of bulk RNA-Seq samples that will be generated", 
                  "is equal to", num.bulk.samples))  
  }
  # split data into training and test sets
  cells <- cells.metadata[, cell.ID.column]
  names(cells) <- cells.metadata[, cell.type.column]
  # train set: 
  # balanced.type.cells == TRUE --> same number of cells by cell types
  # balanced.type.cells == FALSE --> completely random
  if (!balanced.type.cells) {
    train.set <- sample(cells, size = round(nrow(cells.metadata) * train.freq.cells))
    # check if we have all cell types in train and test data
    test.set.tmp <- cells[!cells %in% train.set]
    if (length(unique(names(train.set))) != length(unique(names(cells))) ||
        length(unique(names(test.set.tmp))) != length(unique(names(cells)))) {
      train.set <- lapply(
        X = as.list(unique(names(cells))), 
        FUN = function(x, cells, train.freq.cells) {
          sample(
            cells[names(cells) == x], 
            size = round(length(cells[names(cells) == x]) * train.freq.cells)
          )
        }, cells = cells, train.freq.cells
      ) %>% unlist()  
    }
  } else {
    train.set <- lapply(
      X = as.list(unique(names(cells))), 
      FUN = function(x, cells, train.freq.cells) {
        sample(
          cells[names(cells) == x], 
          size = round(length(cells[names(cells) == x]) * train.freq.cells)
        )
      }, cells = cells, train.freq.cells
    ) %>% unlist()
  }
  train.types <- names(train.set)
  train.set.list <- list()
  # sort cell types in order to speed up reading times in HDF5 files --> 
  # same order as are stored simulated cells in HDF5 file (unnecessary)
  if (!is.null(zinb.params(object))) {
    model.cell.types <- grep(pattern = cell.type.column,
                             x = colnames(zinb.params(object)@zinbwave.model@X),
                             value = T)
    cell.type.names <- sub(pattern = cell.type.column,
                           replacement = "",
                           x = model.cell.types)
    if (any(levels(factor(train.types)) %in% cell.type.names)) {
      cell.type.names <- c(
        cell.type.names,
        setdiff(levels(factor(train.types)), cell.type.names)
      )
    }
  } else {
    cell.type.names <- levels(factor(train.types))
  }
  cell.type.train <- cell.type.names[cell.type.names %in% levels(factor(train.types))]
  for (ts in cell.type.train) {
    train.set.list[[ts]] <- train.set[train.types == ts]
  }
  # test set
  test.set <- cells[!cells %in% train.set]
  test.types <- names(test.set)
  test.set.list <- list()
  cell.type.test <- cell.type.names[cell.type.names %in% levels(factor(test.types))]
  for (ts in cell.type.test) {
    test.set.list[[ts]] <- test.set[test.types == ts]
  }
  # check if we have all the cell types in test data
  if (length(unique(names(test.set))) != length(unique(names(cells)))) {
    stop(
      paste(
        "Not all cell types considered in DigitalDLSorter object are in test",
        "data. digitalDLSorteR needs to have all cell types in both subsets",
        "(training and test). Please, provide a bigger single-cell experiment",
        "or consider simulate new single-cell profiles with", 
        "'estimateZinbwaveParams' and 'simSCProfiles' functions"
      )
    )
  }
  
  # check if all cell types are present in train and test data
  if (verbose) {
    message("\n=== Training set cells by type:")
    tb <- unlist(lapply(train.set.list, length))
    message(paste0("    - ", names(tb), ": ", tb, collapse = "\n"), "\n")
    message("=== Test set cells by type:")
    tb <- unlist(lapply(test.set.list, length))
    message(paste0("    - ", names(tb), ": ", tb, collapse = "\n"), "\n")
  }
  # take prob.design
  prob.list <- lapply(
    split(prob.design, seq(nrow(prob.design))), 
    FUN = function(x) {
      return(seq(from = x[['from']], to = x[['to']]))
    }
  )
  names(prob.list) <- prob.design[, cell.type.column]
  n.cell.types <- length(unique(train.types))
  functions.list <- list(
    .generateSet1, 
    .generateSet2, 
    .generateSet3, 
    # .generateSet4, ## removed to include 5 without excluding cell types
    .generateSet5, ## excluding
    .generateSet5, ## no excluding
    .generateSet6
  )
  ## seguir aquí 
  # TRAIN SETS #################################################################
  train.prob.matrix <- matrix(
    NA_real_, nrow = sum(nums.train), ncol = n.cell.types
  )
  colnames(train.prob.matrix) <- names(prob.list)
  train.plots <- list()
  n <- 1
  loc <- c(0, cumsum(nums.train))
  for (fun in seq_along(functions.list)) {
    if (nums.train[n] == 0) {
      n <- n + 1
      next
    }
    if (!fun %in% c(5, 6)) {
      index.ex <- .generateExcludedTypes(
        num = nums.train[n],
        s.cells = total.train,
        n.cell.types = n.cell.types,
        prob.zero = prob.sparsity
      )  
    } else {
      index.ex <- NULL
    }
    train.probs <- functions.list[[fun]](
      prob.list = prob.list,
      num = nums.train[n],
      s.cells = total.train,
      n.cell.types = n.cell.types,
      index.ex = index.ex
    )
    train.prob.matrix[seq(loc[n] + 1, loc[n + 1]), ] <- train.probs
    train.plots[[n]] <- .plotsQCSets(
      probs = train.probs,
      prob.matrix = train.prob.matrix,
      n = n,
      set = "train"
    )
    n <- n + 1
  }
  # check if matrix is correctly built
  if (is.null(dim(train.prob.matrix))) 
    train.prob.matrix <- matrix(train.prob.matrix, nrow = 1)
  if (is.null(colnames(train.prob.matrix)))
    colnames(train.prob.matrix) <- prob.design[, cell.type.column]
  
  rownames(train.prob.matrix) <- paste("Bulk", seq(dim(train.prob.matrix)[1]),
                                       sep = "_")
  train.plots <- train.plots[!unlist(lapply(train.plots, is.null))]
  train.method <- unlist(
    mapply(
      FUN = function(name.method, num.samp) rep(name.method, num.samp), 
      name.method = paste("Method", seq(1, length(nums.train))), 
      num.samp = nums.train
    )
  )
  names(train.method) <- paste0("Bulk_", seq(sum(nums.train)))
  if (verbose) {
    message("=== Probability matrix for training data:")
    message(paste(c("    - Bulk RNA-Seq samples:", "    - Cell types:"),
                  dim(train.prob.matrix),
                  collapse = "\n"), "\n")
  }
  
  # TEST SETS ##################################################################
  test.prob.matrix <-matrix(NA_real_, nrow = sum(nums.test), ncol = n.cell.types)
  test.plots <- list()
  n <- 1
  # hardcoded because the last function cannot receive prob.zero > 0
  loc <- c(0, cumsum(nums.test))
  for (fun in seq_along(functions.list)) {
    if (nums.test[n] == 0) {
      n <- n + 1
      next
    }
    if (!fun %in% c(5, 6)) {
      index.ex <- .generateExcludedTypes(
        num = nums.test[n],
        s.cells = total.test,
        n.cell.types = n.cell.types,
        prob.zero = prob.sparsity
      )  
    } else {
      index.ex <- NULL
    }
    test.probs <- functions.list[[fun]](
      prob.list = prob.list,
      num = nums.test[n],
      s.cells = total.test,
      n.cell.types = n.cell.types,
      index.ex = index.ex
    )
    test.prob.matrix[seq(loc[n] + 1, loc[n + 1]), ] <- test.probs
    test.plots[[n]] <- .plotsQCSets(
      probs = test.probs,
      prob.matrix = test.prob.matrix,
      n = n,
      set = "test"
    )
    n <- n + 1
  }
  # check if matrix is correctly built
  if (is.null(dim(test.prob.matrix))) 
    test.prob.matrix <- matrix(test.prob.matrix, nrow = 1)
  if (is.null(colnames(test.prob.matrix)))
    colnames(test.prob.matrix) <- prob.design[, cell.type.column]
  
  rownames(test.prob.matrix) <- paste("Bulk", seq(dim(test.prob.matrix)[1]),
                                      sep = "_")
  test.plots <- test.plots[!unlist(lapply(test.plots, is.null))]
  test.method <- unlist(
    mapply(
      FUN = function(name.method, num.samp) rep(name.method, num.samp), 
      name.method = paste("Method", seq(1, length(nums.test))) , 
      num.samp = nums.test
    )
  )
  names(test.method) <- paste0("Bulk_", seq(sum(nums.test)))
  if (verbose) {
    message("=== Probability matrix for test data:")
    message(paste(c("    - Bulk RNA-Seq samples:", "    - Cell types:"),
                  dim(test.prob.matrix),
                  collapse = "\n"), "\n")
  }
  # GENERATE PROBS MATRIX NAMES
  train.prob.matrix.names <- t(apply(
    X = train.prob.matrix,
    MARGIN = 1,
    FUN = setCount,
    setList = train.set.list,
    sn = colnames(train.prob.matrix),
    n.cells = n.cells
  ))
  test.prob.matrix.names <- t(apply(
    X = test.prob.matrix,
    MARGIN = 1,
    FUN = setCount,
    setList = test.set.list,
    sn = colnames(test.prob.matrix),
    n.cells = n.cells
  ))

  # generate object of ProbMatrixCellTypes class
  train.prob.matrix.object <- new(
    Class = "ProbMatrixCellTypes",
    prob.matrix = train.prob.matrix,
    cell.names = train.prob.matrix.names,
    set.list = train.set.list,
    set = train.set,
    method = train.method,
    plots = train.plots,
    type.data = "train"
  )
  test.prob.matrix.object <- new(
    Class = "ProbMatrixCellTypes",
    prob.matrix = test.prob.matrix,
    cell.names = test.prob.matrix.names,
    set.list = test.set.list,
    set = test.set,
    method = test.method,
    plots = test.plots,
    type.data = "test"
  )
  prob.cell.types(object) <- list(
    train = train.prob.matrix.object,
    test = test.prob.matrix.object
  )
  if (verbose) message("DONE")
  return(object)
}

.violinPlot <- function(
  df, 
  title, 
  x = "CellType", 
  y = "Prob"
) {
  plot <- ggplot(df, aes(x = .data[[x]], y = .data[[y]])) +
    geom_violin() + ggtitle(title) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
  return(plot)
}

.boxPlot <- function(
  df, 
  title, 
  x = "CellType", 
  y = "Prob"
) {
  plot <- ggplot(df, aes(x = .data[[x]], y = .data[[y]])) +
    geom_boxplot() + ggtitle(title) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
  return(plot)
}

.linesPlot <- function(
  df, 
  title, 
  x = "CellType", 
  y = "Prob", 
  group = "Sample"
) {
  plot <- ggplot(df, aes(x = .data[[x]], y = .data[[y]],
                        group = .data[[group]])) +
    geom_line(colour = "grey60") + ggtitle(title) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
  return(plot)
}

.plotsQCSets <- function(
  probs, 
  prob.matrix, 
  n, 
  set
) {
  title <- paste0("Bulk Probability Dist. Set ", n, " (", set, ")")
  n.samples <- paste("# samples:", dim(probs)[1])
  plots.functions <- list(.violinPlot, .boxPlot, .linesPlot)
  df <- reshape2::melt(probs)
  colnames(df) <- c("Sample", "CellType", "Prob")
  # first three plots
  plot.list <- lapply(plots.functions, function(f) f(df, paste(title, n.samples)))
  # final plots
  dummy <- t(apply(as.matrix(stats::na.omit(prob.matrix)), 1, sort, decreasing = T))
  df <- reshape2::melt(dummy)
  colnames(df) <- c("Sample", "nCellTypes", "Prob")
  df$nCellTypes <- factor(df$nCellTypes)
  plot.list[[4]] <- .boxPlot(df = df, x = "nCellTypes", title = title)
  names(plot.list) <- c("violinplot", "boxplot", "linesplot", "ncelltypes")
  return(plot.list)
}

# there is something wrong
setCount <- function(
  x, 
  setList, 
  sn, 
  n.cells
) {
  names(x) <- sn
  sc <- c()
  x.set <- .setHundredLimit(x = (x * n.cells) / 100, limit = n.cells)
  for (cType in names(x)) {
    n <- ceiling(x.set[cType]) # n <- ceiling(x[cType])
    if (n > 0) {
      if (is.null(setList[[cType]]))
        next
      repl <- ifelse(test = n > length(setList[[cType]]), yes = TRUE, no = FALSE)
      sc <- c(sc, sample(setList[[cType]], size = n, replace = repl))
    }
  }
  return(sc[seq(n.cells)])
}

# introduce zeros in selected proportions. when these samples will be 
# simulated, they will have zero values in these cell types
.cellExcluder <- function(vec, index.ex) {
  # sel <- sample(x = index.ex, size = length(index.ex)) # -1
  vec[index.ex] <- 0
  return(list(vec, index.ex))
}

# recursive implementation
# this function take 1 cell type from the available cell types (without 
# using index.ex cell types) and add up or subtract depending what is needed
.setHundredLimit <- function(
  x, 
  index.ex = NULL, 
  limit = 100
) {
  if (sum(x) > limit) {
    while (TRUE) {
      if (is.null(index.ex)) {
        sel <- sample(x = seq(length(x)), size = 1)
      } else {
        if (length(index.ex) != length(x) - 1) {
          sel <- sample(x = which(x != 0), size = 1)
        } else {
          sel <- sample(x = seq(length(x))[-index.ex], size = 1) 
        }
      }
      res <- x[sel] - abs(sum(x) - limit)
      if (res < 0) res <- x[sel] - sample(x = x[sel], size = 1)
      if (res >= 0) break
    }
    x[sel] <- res
  } else if (sum(x) < limit) {
    while (TRUE) {
      if (is.null(index.ex)) {
        sel <- sample(x = seq(length(x)), size = 1)
      } else {
        sel <- sample(x = seq(length(x))[-index.ex], size = 1)
      }
      res <- x[sel] + abs(sum(x) - limit)
      if (res <= limit) break
    }
    x[sel] <- res
  }
  if (sum(x) != limit) 
    return(.setHundredLimit(x = x, index.ex = index.ex, limit = limit))
  else 
    return(x)
}

# This function is not intended to deal with index.ex, so I'm going to avoid
# it. Proportions will be corrected by .setHundredLimit. The idea is to 
# distribute the proportions equally between the different cell types
.adjustHundred <- function(
  x,
  prob.list,
  index.ex = NULL,
  sampling = TRUE
) {
  w <- unlist(lapply(prob.list, sample, 1))
  # remove cell types if needed
  if (!is.null(index.ex)) {
    x.list <- .cellExcluder(vec = x, index.ex = index.ex)
    x <- x.list[[1]]
    # w[x.list[[2]]] <- 0
    # # this sentence is in order to avoid that there is only one sample when they
    # # should be two
    # if (any(w[-x.list[[2]]] == 0)) {
    #   no.zero <- setdiff(seq_along(w), x.list)
    #   w[no.zero[w[no.zero] == 0]] <- 1
    # }
  }
  d <- abs(sum(x) - 100)
  if (sum(x) > 100) {
    div.w <- (w / sum(w)) * d
    while (!all(x >= div.w)) {
      index <- which(!x >= div.w)
      w[index] <- 0
      div.w <- (w / sum(w)) * d
    }
    x <- round(x - div.w)
    x <- .setHundredLimit(x = x, index.ex = index.ex)
  } else if (sum(x) < 100) {
    div.w <- (w / sum(w)) * d
    while (!all(100 - div.w > x)) {
      index <- which(!100 - div.w > x)
      w[index] <- 0
      div.w <- (w / sum(w)) * d
    }
    x <- round(x + div.w)
    x <- .setHundredLimit(x = x, index.ex = index.ex)
  }
  return(x)
}


# generate index.ex variable for each sample using prob.zero
.generateExcludedTypes <- function(
  num,
  s.cells,
  n.cell.types,
  prob.zero
) {
  if (prob.zero > 1 || prob.zero < 0)
    stop("'prob.zero' must be an integer between 0 and 1")
  prob.zero <- 1 - prob.zero
  # number of cell types equal to zero have the same probability: only changes
  # the probability of having 0 zeros. Maybe this could be improvable
  random.zeros <- function(prob.zero) {
    num.zero <- sample(
      seq(0, n.cell.types - 1), size = 1,
      prob = c(prob.zero, rep((1 - prob.zero) / (n.cell.types - 1), n.cell.types - 1))
    )
    # num.zero <- sample(x = seq(1, n.cell.types - 1), size = 1)
    if (num.zero == 0) {
      return(NULL) 
    } else {
      res <- sample(seq(n.cell.types), size = num.zero)
      if (length(res) == n.cell.types) 
        res <- res[-sample(x = n.cell.types, size = 1)]
    }
    return(res) 
  }
  return(
    replicate(
      n = num,
      expr = random.zeros(prob.zero = prob.zero)
    )
  )
}

# cell proportions are randomly sampled from a truncated uniform distribution 
# with predefined limits according to the a priori knowledge of the abundance 
# of each cell type
.generateSet1 <- function(
  prob.list,
  num,
  s.cells,
  n.cell.types,
  index.ex
) {
  sampling <- function(prob.list, ex.cells) {
    return(
      .cellExcluder(
        vec = unlist(lapply(X = prob.list, FUN = sample, 1)),
        index.ex = ex.cells
      )[[1]]
    )
  }
  # preallocate matrix in order to avoid memory problems
  prob.matrix <- matrix(NA_real_, nrow = num, ncol = n.cell.types)
  c <- 1
  while (c <= num) {
    prob.matrix[c,] <- sampling(prob.list = prob.list, ex.cells = index.ex[[c]])
    c <- c + 1
  }
  prob.matrix <- round(prob.matrix * 100 / rowSums(prob.matrix))
  prob.matrix <- do.call(
    what = rbind, 
    args = lapply(
      X = seq(nrow(prob.matrix)), 
      FUN = function(x) {
        .setHundredLimit(x = prob.matrix[x, ], index.ex = index.ex[[x]])
      }
    )
  )
  colnames(prob.matrix) <- names(prob.list)
  return(prob.matrix)
}

# generated by randomly permuting cell type labels on the previous proportions
# (.generateSet1)
.generateSet2 <- function(
  prob.list,
  num,
  s.cells,
  n.cell.types,
  index.ex
) {
  # First, I correct proportions: cell types with zero are not going to be 
  # changed, but in the next line I resample labels
  prob.matrix <- .generateSet1(
    prob.list = prob.list,
    num = num,
    s.cells = s.cells,
    n.cell.types = n.cell.types,
    index.ex = index.ex
  )
  prob.matrix <- t(apply(X = prob.matrix, MARGIN = 1, FUN = sample))
  colnames(prob.matrix) <- names(prob.list)
  return(prob.matrix)
}

.generateSet3 <- function(
  prob.list,
  num,
  s.cells,
  n.cell.types,
  index.ex
) {
  # preallocate matrix in order to avoid memory problems
  prob.matrix <- matrix(NA_real_, nrow = num, ncol = n.cell.types)
  c <- 1
  while (c <= num) {
    p <- rep(0, n.cell.types)
    # To avoid a bias with first cell types, I use sample
    i <- sample(
      x = setdiff(x = seq(n.cell.types), y = index.ex[[c]]),
      size = 1
    )
    while (sum(p) < 100) {
      p[i] <- p[i] + sample(x = seq((100 - sum(p))), size = 1) #  / n.cell.types
      i <- sample(
        x = setdiff(x = seq(n.cell.types), y = index.ex[[c]]), # c(index.ex[[c]], i)
        size = 1
      )
    }
    p <- round(p * 100 / sum(p))
    p <- .adjustHundred(x = p, prob.list = prob.list, index.ex = index.ex[[c]])
    prob.matrix[c, ] <- p
    # counter
    c <- c + 1
  }
  colnames(prob.matrix) <- names(prob.list)
  return(prob.matrix)
}

.generateSet4 <- function(
  prob.list,
  num,
  s.cells,
  n.cell.types,
  index.ex
) {
  prob.matrix <- matrix(NA_real_, nrow = num, ncol = n.cell.types)
  c <- 1
  while(c <= num) {
    p <- rep(0, n.cell.types)
    names(p) <- names(prob.list)
    i <- sample(
      x = setdiff(x = seq(n.cell.types), y = index.ex[[c]]), 
      size = 1
    )
    while (sum(p) < 100) {
      dp <- 101
      while (dp > max(prob.list[[i]])) {
        dp <- sample(x = prob.list[[i]], size = 1)
      }
      p[i] <- p[i] + dp
      i <- sample(
        x = setdiff(x = seq(n.cell.types), y = index.ex[[c]]), 
        size = 1
      )
    }
    prob.matrix[c, ] <- p
    c <- c + 1
  }
  prob.matrix <- round(prob.matrix * 100 / rowSums(prob.matrix))
  prob.matrix <- do.call(
    what = rbind, 
    args = lapply(
      X = seq(nrow(prob.matrix)), 
      FUN = function(x) {
        p <- .adjustHundred(
          x = prob.matrix[x, ], prob.list = prob.list, index.ex = index.ex[[x]]
        )
        return(sample(p))
      }
    )
  )
  # p <- .adjustHundred(x = p, prob.list = prob.list, index.ex = index.ex[[c]])
  # p <- sample(p)
  colnames(prob.matrix) <- names(prob.list)
  return(prob.matrix)
}

## generate spots based on a Dirichlet distribution: this is the function 
# I use in the SpatialDDLS R package
.generateSet5 <- function(
  prob.list,
  num,
  s.cells,
  n.cell.types,
  index.ex
) {
  prob.matrix <- do.call(
    what = rbind, 
    args = lapply(
      X = seq(num), 
      FUN = function(x) {
        p <- gtools::rdirichlet(
          n = 1,
          alpha = .cellExcluder(
            rep(1, n.cell.types), index.ex = index.ex[[x]]
          )[[1]]
        )
        p <- round(p * 100 / sum(p))
        return(.setHundredLimit(x = p, index.ex = index.ex[[x]]))
      }
    )
  )
  colnames(prob.matrix) <- names(prob.list)
  return(prob.matrix)
}

# generate pseudo-bulk samples consisted of only 1 cell type
.generateSet6 <- function(
  prob.list,
  num,
  s.cells,
  n.cell.types,
  index.ex
) {
  prob.matrix <- matrix(0, nrow = num, ncol = n.cell.types)
  num.set <- .setHundredLimit(
    x = rep(ceiling(num / n.cell.types), n.cell.types), limit = num
  )
  num.index <- c(0, cumsum(num.set))
  # which(num.set != 0) because may be cell types without pseudo-bulk --> error
  for (i in which(num.set != 0)) {
    prob.matrix[seq(num.index[i] + 1, num.index[i + 1]), i] <- 100
  }
  colnames(prob.matrix) <- names(prob.list)
  return(prob.matrix)
}

################################################################################
######################## Simulate bulk RNA-Seq samples #########################
################################################################################

#' Simulate training and test pseudo-bulk RNA-Seq profiles
#'
#' Simulate training and test pseudo-bulk RNA-Seq profiles using the cell
#' composition matrices generated by the \code{\link{generateBulkCellMatrix}}
#' function. The samples are generated under the assumption that the expression
#' level of the \eqn{i} gene in the \eqn{j} bulk sample is given by the sum of
#' the expression levels of the cell types \eqn{X_{ijk}} that make them up
#' weighted by the proportions of these \eqn{k} cell types in each sample. In
#' practice, as described in Torroja and Sanchez-Cabo, 2019, these profiles are
#' generated by summing a number of cells of different cell types determined by
#' proportions from a matrix of known cell composition. The number of simulated
#' pseudo-bulk RNA-Seq samples and the number of cells composing each sample are
#' determined by \code{\link{generateBulkCellMatrix}} (see Documentation)
#' \strong{Note:} this step can be avoided by using the \code{on.the.fly}
#' argument in the \code{\link{trainDDLSModel}} function. See
#' Documentation for more information.
#'
#' \pkg{digitalDLSorteR} allows the use of HDF5 files as back-end to store the
#' resulting data using the \pkg{DelayedArray} and \pkg{HDF5Array} packages.
#' This functionality allows to work without keeping the data loaded into RAM,
#' which could be of vital importance during some computationally heavy steps
#' such as neural network training on RAM-limited machines. You must provide a
#' valid file path in the \code{file.backend} argument to store the resulting
#' file with the '.h5' extension. The data will be accessible from R without
#' being loaded into memory. This option slightly slows down execution times, as
#' subsequent transformations of the data will be done in blocks rather than
#' using all the data. We recommend this option according to the computational
#' resources available and the number of pseudo-bulk samples to be generated.
#'
#' Note that if you use the \code{file.backend} argument with
#' \code{block.processing = FALSE}, all pseudo-bulk profiles will be simulated
#' in one step and, therefore, loaded into RAM. Then, the data will be written
#' to an HDF5 file. To avoid the RAM collapse, pseudo-bulk profiles can be
#' simulated and written to HDF5 files in blocks of \code{block.size} size by
#' setting \code{block.processing = TRUE}.
#'
#' It is possible to avoid this step by using the \code{on.the.fly} argument in
#' the \code{\link{trainDDLSModel}} function. In this way, data is
#' generated 'on the fly' during the neural network training. For more details,
#' see \code{?\link{trainDDLSModel}}.
#'
#' @param object \code{\linkS4class{DigitalDLSorter}} object with
#'   \code{single.cell.real}/\code{single.cell.simul} and \code{prob.cell.types}
#'   slots.
#' @param type.data Type of data to generate between \code{'train'},
#'   \code{'test'} or \code{'both'} (the last by default).
#' @param pseudobulk.function Function used to build pseudo-bulk samples. It may
#'   be: \itemize{ \item \code{"MeanCPM"}: single-cell profiles (raw counts) are
#'   transformed into CPMs and cross-cell averages are calculated. Then,
#'   \code{log2(CPM + 1)} is calculated. \item \code{"AddCPM"}: single-cell
#'   profiles (raw counts) are transformed into CPMs and are added up across
#'   cells. Then, log-CPMs are calculated. \item \code{"AddRawCount"}:
#'   single-cell profiles (raw counts) are added up across cells. Then, log-CPMs
#'   are calculated.}
#' @param file.backend Valid file path to store the simulated single-cell
#'   expression profiles as an HDF5 file (\code{NULL} by default). If provided,
#'   the data is stored in HDF5 files used as back-end by using the
#'   \pkg{DelayedArray}, \pkg{HDF5Array} and \pkg{rhdf5} packages instead of
#'   loading all data into RAM memory. This is suitable for situations where you
#'   have large amounts of data that cannot be loaded into memory. Note that
#'   operations on this data will be performed in blocks (i.e subsets of
#'   determined size) which may result in longer execution times.
#' @param compression.level The compression level used if \code{file.backend} is
#'   provided. It is an integer value between 0 (no compression) and 9 (highest
#'   and slowest compression). See
#'   \code{?\link[HDF5Array]{getHDF5DumpCompressionLevel}} from the
#'   \pkg{HDF5Array} package for more information.
#' @param block.processing Boolean indicating whether the data should be
#'   simulated in blocks (only if \code{file.backend} is used, \code{FALSE} by
#'   default). This functionality is suitable for cases where is not possible to
#'   load all data into memory and it leads to larger execution times.
#' @param block.size Only if \code{block.processing = TRUE}. Number of
#'   pseudo-bulk expression profiles that will be simulated in each iteration
#'   during the process. Larger numbers result in higher memory usage but
#'   shorter execution times. Set according to available computational resources
#'   (1000 by default).
#' @param chunk.dims Specifies the dimensions that HDF5 chunk will have. If
#'   \code{NULL}, the default value is a vector of two items: the number of
#'   genes considered by \code{\linkS4class{DigitalDLSorter}} object during the
#'   simulation, and a single sample to reduce read times in the following
#'   steps. A larger number of columns written in each chunk can lead to longer
#'   read times.
#' @param threads Number of threads used during the simulation of pseudo-bulk
#'   samples (1 by default). Set according to computational resources and avoid
#'   it if \code{block.size} will be used.
#' @param verbose Show informative messages during the execution (\code{TRUE} by
#'   default).
#'
#' @return A \code{\linkS4class{DigitalDLSorter}} object with \code{bulk.simul}
#'   slot containing a list with one or two entries (depending on selected
#'   \code{type.data} argument): \code{'train'} and \code{'test'}. Each entry
#'   contains a \code{SummarizedExperiment} object
#'   with simulated bulk samples in the \code{assay} slot, sample names in the
#'   \code{colData} slot and feature names in the \code{rowData} slot.
#'
#' @export
#'
#' @seealso \code{\link{generateBulkCellMatrix}}
#'   \code{\linkS4class{ProbMatrixCellTypes}}
#'   \code{\link{trainDDLSModel}}
#'
#' @examples
#' set.seed(123) # reproducibility
#' # simulated data
#' sce <- SingleCellExperiment::SingleCellExperiment(
#'   assays = list(
#'     counts = matrix(
#'       rpois(30, lambda = 5), nrow = 15, ncol = 10,
#'       dimnames = list(paste0("Gene", seq(15)), paste0("RHC", seq(10)))
#'     )
#'   ),
#'   colData = data.frame(
#'     Cell_ID = paste0("RHC", seq(10)),
#'     Cell_Type = sample(x = paste0("CellType", seq(2)), size = 10,
#'                        replace = TRUE)
#'   ),
#'   rowData = data.frame(
#'     Gene_ID = paste0("Gene", seq(15))
#'   )
#' )
#' DDLS <- createDDLSobject(
#'   sc.data = sce,
#'   sc.cell.ID.column = "Cell_ID",
#'   sc.gene.ID.column = "Gene_ID",
#'   sc.filt.genes.cluster = FALSE, 
#'   sc.log.FC = FALSE
#' )
#' probMatrixValid <- data.frame(
#'   Cell_Type = paste0("CellType", seq(2)),
#'   from = c(1, 30),
#'   to = c(15, 70)
#' )
#' DDLS <- generateBulkCellMatrix(
#'   object = DDLS,
#'   cell.ID.column = "Cell_ID",
#'   cell.type.column = "Cell_Type",
#'   prob.design = probMatrixValid,
#'   num.bulk.samples = 10,
#'   verbose = TRUE
#' )
#' DDLS <- simBulkProfiles(DDLS, verbose = TRUE)
#'
#' @references Fischer B, Smith M and Pau, G (2020). rhdf5: R Interface to HDF5.
#'   R package version 2.34.0.
#'
#'   Pagès H, Hickey P and Lun A (2020). DelayedArray: A unified framework for
#'   working transparently with on-disk and in-memory array-like datasets. R
#'   package version 0.16.0.
#'
#'   Pagès H (2020). HDF5Array: HDF5 backend for DelayedArray objects. R package
#'   version 1.18.0.
#'   
simBulkProfiles <- function(
  object,
  type.data = "both",
  pseudobulk.function = "AddRawCount",
  file.backend = NULL,
  compression.level = NULL,
  block.processing = FALSE,
  block.size = 1000,
  chunk.dims = NULL,
  threads = 1,
  verbose = TRUE
) {
  if (!is(object, "DigitalDLSorter")) {
    stop("The object provided is not of DigitalDLSorter class")
  } else if (is.null(single.cell.real(object))) {
    stop("There are no real single-cell profiles in DigitalDLSorter object")
  } else if (is.null(prob.cell.types(object))) {
    stop("'prob.cell.types' slot is empty")
  } else if (!any(type.data == c("train", "test", "both"))) {
    stop("'type.data' argument must be one of the following options: 'train', 'test' or 'both'")
  }
  if (!pseudobulk.function %in% c("MeanCPM", "AddCPM", "AddRawCount")) {
    stop("'pseudobulk.function' must be one of the following options: 'MeanCPM', 'AddCPM', 'AddRawCount'")
  } else {
    if (pseudobulk.function == "MeanCPM") {
      .pseudobulk.fun <- pseudobulk.fun.mean.cpm
    } else if (pseudobulk.function == "AddCPM") {
      .pseudobulk.fun <- pseudobulk.fun.add.cpm
    } else if (pseudobulk.function == "AddRawCount") {
      .pseudobulk.fun <- pseudobulk.fun.add.raw.counts
    }
  }
  if (!is.null(file.backend)) {
    if (!requireNamespace("DelayedArray", quietly = TRUE) || 
        !requireNamespace("HDF5Array", quietly = TRUE)) {
      stop("digitalDLSorteR provides the possibility of using HDF5 files as back-end
         when data are too big to be located in RAM. It uses DelayedArray, 
         HDF5Array and rhdf5 to do it. Please install both packages to 
         use this functionality")
    } 
    if (file.exists(file.backend)) {
      stop("'file.backend' already exists. Please provide a correct file path")
    }
    if (is.null(compression.level)) {
      compression.level <- HDF5Array::getHDF5DumpCompressionLevel()
    } else {
      if (compression.level < 0 || compression.level > 9) {
        stop("'compression.level' must be an integer between 0 (no compression) ",
             "and 9 (highest and slowest compression). ")
      }
    }
  }
  if (threads <= 0) threads <- 1
  if (verbose) {
    message(paste("=== Setting parallel environment to", threads, "thread(s)"))
  }
  if (type.data == "both") {
    if (!is.null(object@bulk.simul)) {
      warning("'bulk.simul' slot will be overwritten\n\n",
              call. = FALSE, immediate. = TRUE)
    }
    bulk.counts <- lapply(
      X = c("train", "test"),
      FUN = function(x) {
        if (verbose) {
          message(paste("\n=== Generating", x, "bulk samples:"))
        }
        .generateBulkProfiles(
          object = object,
          type.data = x,
          fun.pseudobulk = .pseudobulk.fun,
          unit = pseudobulk.function,
          file.backend = file.backend,
          compression.level = compression.level,
          block.processing = block.processing,
          block.size = block.size,
          chunk.dims = chunk.dims,
          threads = threads,
          verbose = verbose
        )
      }
    )
    names(bulk.counts) <- c("train", "test")
    object@bulk.simul <- bulk.counts
  } else {
    if (!is.null(bulk.simul(object)) && type.data %in% names(bulk.simul(object))) {
      warning(paste(type.data, "data in 'bulk.simul' slot will be overwritten", 
                    "\n\n"),
              call. = FALSE, immediate. = TRUE)
    }
    if (verbose) {
      message(paste("\n=== Generating", type.data, "bulk samples:"))
    }
    bulk.counts <- .generateBulkProfiles(
      object = object,
      type.data = type.data,
      fun.pseudobulk = .pseudobulk.fun,
      unit = pseudobulk.function,
      file.backend = file.backend,
      compression.level = compression.level,
      block.processing = block.processing,
      block.size = block.size,
      chunk.dims = chunk.dims,
      threads = threads,
      verbose = verbose
    )
    if (!is.null(bulk.simul(object))) {
      if (type.data %in% names(bulk.simul(object))) {
        bulk.simul(object, type.data) <- NULL
      }
      bulk.simul(object) <- c(bulk.simul(object), type.data = bulk.counts)
    } else {
      bulk.simul(object) <- list(bulk.counts)
      names(bulk.simul(object)) <- type.data
    }
  }
  if (verbose) message("\nDONE")
  return(object)
}

.generateBulkProfiles <- function(
  object,
  type.data,
  fun.pseudobulk,
  unit,
  file.backend,
  compression.level,
  block.processing,
  block.size,
  chunk.dims,
  threads,
  verbose
) {
  sel.bulk.cells <- prob.cell.types(object, type.data) %>% cell.names()
  sel.bulk.cells <- sel.bulk.cells[sample(nrow(sel.bulk.cells)), ]
  if (!is.null(single.cell.simul(object))) {
    suffix.names <- unique(colData(single.cell.simul(object))$suffix)
  } else {
    suffix.names <- "_Simul"
  }
  pattern <- suffix.names
  if (!verbose) pbo <- pbapply::pboptions(type = "none")
  if (block.processing && is.null(file.backend)) {
    stop("'block.processing' is only compatible to the use of HDF5 files ", 
         "as back-end ('file.backend' argument)")
  } else if (block.processing && !is.null(file.backend)) {
    n <- nrow(sel.bulk.cells)
    J <- nrow(assay(single.cell.real(object)))
    if (n < block.size) {
      block.size <- n
      warning("The number of simulated samples is less than 'block.size'. ",
              "A single block will be performed.", 
              call. = FALSE, immediate. = TRUE)
    }
    if (is.null(chunk.dims) || length(chunk.dims) != 2) chunk.dims <- c(J, 1)
    if (!file.exists(file.backend)) rhdf5::h5createFile(file.backend)
    rhdf5::h5createDataset(
      file.backend, type.data, 
      dims = c(J, block.size), 
      maxdims = c(J, n), 
      chunk = chunk.dims,
      storage.mode = "double"
    )
    r.i <- 0
    # iteration over cells 
    for (iter in seq(ceiling(n / block.size))) {
      if (verbose) message(paste("   - Writing block", iter))
      if ((block.size * iter) - n > 0) {
        dif <- block.size
        dif.2 <- (block.size * iter) - n
        block.size <- block.size - dif.2
      } else {
        dif <- block.size
      }
      sub.i <- seq(from = r.i + 1, to = r.i + block.size)
      r.i <- r.i + block.size
      bulk.samples <- pbapply::pbapply(
        X = sel.bulk.cells[sub.i, , drop = FALSE],
        MARGIN = 1,
        FUN = .setBulk,
        object = object,
        pattern = pattern,
        fun.pseudobulk = fun.pseudobulk,
        cl = threads
      )
      if (iter == 1) {
        rhdf5::h5write(
          obj = bulk.samples, file = file.backend, 
          name = type.data, level = compression.level
        )
      } else {
        # check number of cells in the next loop
        rhdf5::h5set_extent(file.backend, type.data, dims = c(J, n))
        rhdf5::h5write(
          obj = bulk.samples, 
          file = file.backend, name = type.data, 
          index = list(seq(J), seq((dif * (iter - 1)) + 1, 
                                   (dif * (iter - 1)) + ncol(bulk.samples))),
          level = compression.level
        )
      }
    }
    rhdf5::H5close()
    # HDF5Array object for SingleCellExperiment class
    bulk.samples <- HDF5Array::HDF5Array(file = file.backend, name = type.data)
    dimnames(bulk.samples) <- list(rownames(assay(single.cell.real(object))), 
                                   rownames(sel.bulk.cells))
  } else if (!block.processing) {
    bulk.samples <- pbapply::pbapply(
      X = sel.bulk.cells,
      MARGIN = 1,
      FUN = .setBulk,
      object = object,
      pattern = pattern,
      fun.pseudobulk = fun.pseudobulk,
      cl = threads
    )
  }
  if (!verbose) on.exit(pbapply::pboptions(pbo))
  return(
    .createSEObject(
      counts = bulk.samples,
      samples.metadata = prob.cell.types(
        object, type.data
      )@prob.matrix[rownames(sel.bulk.cells), ],
      genes.metadata = rownames(assay(single.cell.real(object))),
      mixing.fun = unit,
      file.backend = file.backend,
      compression.level = compression.level,
      block.processing = block.processing,
      type.data = type.data,
      verbose = verbose
    )
  )
}

.createSEObject <- function(
  counts, 
  samples.metadata, 
  genes.metadata,
  mixing.fun,
  file.backend,
  compression.level,
  block.processing,
  type.data,
  verbose
) {
  # could be a check of counts class -> if (is(counts, "HDF5Array"))
  if (!is.null(file.backend) && !block.processing) {
    counts <- .useH5backend(
      counts = counts,
      file.backend = file.backend,
      compression.level = compression.level,
      group = type.data,
      sparse = FALSE,
      verbose = verbose
    )
  }
  return(
    SummarizedExperiment::SummarizedExperiment(
      assays = list(counts = counts),
      colData = samples.metadata,
      rowData = genes.metadata,
      metadata = list(mixing.fun = mixing.fun)
    )
  )
}

## functions to calculate pseudo-bulk samples
.cpmCalculate <- function(x) {
  if (is.null(dim(x))) return((x / sum(x)) * 1e6)
  else return(apply(X = x, MARGIN = 2, FUN = function(x) (x / sum(x)) * 1e6))
}

pseudobulk.fun.mean.cpm <- function(x) {
  rowMeans(log2(.cpmCalculate(x = x + 1)))
}

pseudobulk.fun.add.cpm <- function(x) {
  .cpmCalculate(x = rowSums(log2(.cpmCalculate(x = x + 1))))
}

pseudobulk.fun.add.raw.counts.cpm <- function(x) {
  log2(.cpmCalculate(x = rowSums(x) + 1))
}

pseudobulk.fun.add.raw.counts <- function(x) {
  rowSums(x)
}

.setBulk <- function(x, object, pattern, fun.pseudobulk) {
  sep.b <- grepl(pattern = pattern, x = x)
  if (any(sep.b)) {
    cols.sim <- match(
      x = x[sep.b], table = colnames(single.cell.simul(object))
    ) %>% sort()
    cols.real <- match(
      x = x[!sep.b], table = colnames(single.cell.real(object))
    ) %>% sort()
    sim.counts <- as.matrix(assay(single.cell.simul(object))[, cols.sim, drop = FALSE])
    real.counts <- as.matrix(assay(single.cell.real(object))[, cols.real, drop = FALSE])
    counts <- .mergeMatrices(x = real.counts, y = sim.counts) # merge matrices
  } else if (all(sep.b)) {
    cols <- sort(match(x = x[sep.b], table = colnames(single.cell.simul(object)))) 
    counts <- as.matrix(assay(single.cell.simul(object))[, cols, drop = FALSE])
  } else {
    cols <- match(x = x, table = colnames(single.cell.real(object))) %>% sort()
    counts <- as.matrix(assay(single.cell.real(object))[, cols, drop = FALSE])
  }
  return(fun.pseudobulk(x = counts))
}
diegommcc/digitalDLSorteR_dev documentation built on Nov. 3, 2024, 3:56 a.m.