R/simSingleCell.R


#' @importFrom stats model.matrix rnbinom rbinom as.formula
#'
NULL

################################################################################
######################### Estimate Zinbwave parameters #########################
################################################################################

#' Estimate the parameters of the ZINB-WaVE model to simulate new single-cell
#' RNA-Seq expression profiles
#'
#' Estimate the parameters of the ZINB-WaVE model using a real single-cell
#' RNA-Seq data set as reference to simulate new single-cell profiles and
#' increase the signal of underrepresented cell types. This step is optional:
#' it is only needed if your dataset is too small or contains underrepresented
#' cell types, so that the Deep Neural Network model can be trained in a more
#' balanced way. After this step, the \code{\link{simSCProfiles}}
#' function will use the estimated parameters to simulate new single-cell
#' profiles. See \code{?\link{simSCProfiles}} for more information.
#'
#' ZINB-WaVE is a flexible model for zero-inflated count data. This function
#' carries out the model fit to real single-cell data modeling \eqn{Y_{ij}} (the
#' count of feature \eqn{j} for sample \eqn{i}) as a random variable following a
#' zero-inflated negative binomial (ZINB) distribution. The estimated parameters
#' will be used for the simulation of new single-cell expression profiles by
#' sampling a negative binomial distribution and inserting dropouts from a
#' binomial distribution. To do so, \pkg{digitalDLSorteR} uses the
#' \code{\link[zinbwave]{zinbFit}} function from the \pkg{zinbwave} package
#' (Risso et al., 2018). For more details about the model, see Risso et al.,
#' 2018.
#'
#' @param object \code{\linkS4class{DigitalDLSorter}} object with a
#'   \code{single.cell.real} slot.
#' @param cell.type.column Name or column number corresponding to the cell type
#'   of each cell in cells metadata.
#' @param cell.ID.column Name or column number corresponding to the cell names
#'   of expression matrix in cells metadata.
#' @param gene.ID.column Name or column number corresponding to the notation
#'   used for features/genes in genes metadata.
#' @param cell.cov.columns Name or column number(s) in cells metadata to be used
#'   as covariates during model fitting (if no covariates are used, leave it
#'   missing or set it to \code{NULL}).
#' @param gene.cov.columns Name or column number(s) in genes metadata that will
#'   be used as covariates during model fitting (if no covariates are used,
#'   leave it missing or set it to \code{NULL}).
#' @param subset.cells Number of cells to fit the ZINB-WaVE model. Useful when
#'   the original data set is too large to fit the model. Set a value according
#'   to the original data set and the resources available on your computer. If
#'   \code{NULL} (by default), all cells will be used. Must be an integer
#'   greater than or equal to the number of cell types considered and less than
#'   the total number of cells.
#' @param proportional If \code{TRUE}, the original cell type proportions in the
#'   subset of cells generated by \code{subset.cells} will not be altered as far
#'   as possible. If \code{FALSE}, all cell types will have the same number of
#'   cells as far as possible (\code{TRUE} by default).
#' @param set.type Cell type(s) to evaluate (\code{'All'} by default). It is
#'   recommended to fit the model to all cell types rather than to only a
#'   subset of them, in order to capture the total variability present in the
#'   original experiment, even if only a subset of cell types will be simulated.
#' @param threads Number of threads used for estimation (1 by default). To set
#'   up the parallel environment, the \pkg{BiocParallel} package must be
#'   installed.
#' @param verbose Show informative messages during the execution (\code{TRUE} by
#'   default).
#' @return A \code{\linkS4class{DigitalDLSorter}} object with \code{zinb.params}
#'   slot containing a \code{\linkS4class{ZinbParametersModel}} object. This
#'   object contains a slot with the estimated ZINB-WaVE parameters from the
#'   real single-cell RNA-Seq data.
#'
#' @export
#'
#' @seealso \code{\link{simSCProfiles}}
#'
#' @examples
#' set.seed(123) # reproducibility
#' sce <- SingleCellExperiment::SingleCellExperiment(
#'   assays = list(
#'     counts = matrix(
#'       rpois(30, lambda = 5), nrow = 15, ncol = 10,
#'       dimnames = list(paste0("Gene", seq(15)), paste0("RHC", seq(10)))
#'     )
#'   ),
#'   colData = data.frame(
#'     Cell_ID = paste0("RHC", seq(10)),
#'     Cell_Type = sample(x = paste0("CellType", seq(2)), size = 10,
#'                        replace = TRUE)
#'   ),
#'   rowData = data.frame(
#'     Gene_ID = paste0("Gene", seq(15))
#'   )
#' )
#' DDLS <- loadSCProfiles(
#'   single.cell.data = sce,
#'   cell.ID.column = "Cell_ID",
#'   gene.ID.column = "Gene_ID"
#' )
#' DDLS <- estimateZinbwaveParams(
#'   object = DDLS,
#'   cell.type.column = "Cell_Type",
#'   cell.ID.column = "Cell_ID",
#'   gene.ID.column = "Gene_ID",
#'   subset.cells = 2,
#'   verbose = TRUE
#' )
#'
#' @references Risso, D., Perraudeau, F., Gribkova, S. et al. (2018). A general
#'   and flexible method for signal extraction from single-cell RNA-seq data.
#'   Nat Commun 9, 284. doi: \doi{10.1038/s41467-017-02554-5}.
#'
#'   Torroja, C. and Sánchez-Cabo, F. (2019). digitalDLSorter: A Deep Learning
#'   algorithm to quantify immune cell populations based on scRNA-Seq data.
#'   Frontiers in Genetics 10, 978. doi: \doi{10.3389/fgene.2019.00978}.
#'   
estimateZinbwaveParams <- function(
  object,
  cell.type.column,
  cell.ID.column,
  gene.ID.column,
  cell.cov.columns,
  gene.cov.columns,
  subset.cells = NULL,
  proportional = TRUE,
  set.type = "All",
  threads = 1,
  verbose = TRUE
) {
  if (!is(object, "DigitalDLSorter")) {
    stop("The object provided is not of DigitalDLSorter class")
  } else if (is.null(single.cell.real(object))) {
    stop("'single.cell.real' slot is empty")
  } else if (missing(cell.ID.column) || missing(gene.ID.column) || 
             is.null(cell.ID.column) || is.null(gene.ID.column)) {
    stop("'cell.ID.column' and 'gene.ID.column' arguments are needed. Please, ", 
         "look at ?estimateZinbwaveParams")
  } else if (missing(cell.type.column) || is.null(cell.type.column)) {
    stop("'cell.type.column' argument is needed. Please, look at ", 
         "?estimateZinbwaveParams")
  }
  if (!is.null(zinb.params(object = object))) {
    warning(
      "'zinb.params' slot already has a ZinbParametersModel object. Note that it will be overwritten\n",
      call. = FALSE, immediate. = TRUE
    )
  }
  # extract data from SCE to list
  list.data <- .extractDataFromSCE(
    SCEobject = single.cell.real(object),
    cell.ID.column = cell.ID.column,
    gene.ID.column = gene.ID.column,
    new.data = FALSE
  )
  # check if cell.ID.column and cell.type.column are correct
  mapply(
    FUN = function(x, y) {
      .checkColumn(
        metadata = list.data[[2]],
        ID.column = x,
        type.metadata = "cells.metadata",
        arg = y
      )  
    },
    x = c(cell.ID.column, cell.type.column),
    y = c("cell.ID.column", "cell.type.column")
  )
  # check if cell type variable has at least two levels
  if (length(unique(list.data[[2]][, cell.type.column])) < 2) 
    stop("'cell.type.column' must have 2 or more unique elements")  
  # if cell.cov.columns is provided, check if everything is correct
  if (!(missing(cell.cov.columns) || is.null(cell.cov.columns))) {
    lapply(
      X = cell.cov.columns, 
      FUN = function(x) {
        .checkColumn(
          metadata = list.data[[2]],
          ID.column = x,
          type.metadata = "cells.metadata",
          arg = "cell.cov.columns"
        )
      }
    )
    lapply(
      X = cell.cov.columns,
      FUN = function(x) {
        if (length(unique(list.data[[2]][, x])) < 2) 
          stop(x, " must have 2 or more unique elements")
      }
    )
  }
  # if gene.cov.columns is provided, check if everything is correct
  if (!(missing(gene.cov.columns) || is.null(gene.cov.columns))) {
    lapply(
      X = gene.cov.columns, 
      FUN = function(x) {
        .checkColumn(
          metadata = list.data[[3]],
          ID.column = x,
          type.metadata = "genes.metadata",
          arg = "gene.cov.columns"
        )
      }
    )
    lapply(
      gene.cov.columns,
      function(x) {
        if (length(unique(list.data[[3]][, x])) < 2) 
          stop(x, " must have 2 or more unique elements")
      }
    )
  }
  # check if gene.ID.column is correct
  lapply(
    X = gene.ID.column, 
    FUN = function(x) {
      .checkColumn(
        metadata = list.data[[3]],
        ID.column = x,
        type.metadata = "genes.metadata",
        arg = "gene.ID.column"
      )
    }
  )
  # set configuration of parallel computations 
  if (threads <= 0) threads <- 1
  if (threads > 1 && !requireNamespace("BiocParallel", quietly = TRUE)) {
    message("=== Setting parallel environment to 1 thread. BiocParallel package is not available\n")
    threads <- 1
  } else if (verbose) {
    message("=== Setting parallel environment to ", threads, " thread(s)\n")
  }
  # BiocParallel is imported by zinbwave, so it is available whenever zinbFit can be run
  parallelEnv <- switch(
    Sys.info()[['sysname']],
    Windows = BiocParallel::SnowParam(workers = threads, type = "SOCK"),
    # Linux, Darwin, SunOS and any other platform
    BiocParallel::MulticoreParam(workers = threads)
  )
  # set cell types that will be estimated
  if ("All" %in% set.type) {
    if (missing(cell.cov.columns) || is.null(cell.cov.columns)) {
      formula.cell.model <- as.formula(
        paste("~", paste(cell.type.column, collapse = "+"))
      ) 
      if (verbose) {
        message("=== Estimating parameters for all cell types in the experiment\n")
        message(paste("=== Creating cell model matrix based on", 
                      cell.type.column, "columns:"))
        message("\t", formula.cell.model, "\n")
      }
    } else {
      formula.cell.model <- as.formula(
        paste("~", paste(c(cell.cov.columns, cell.type.column), collapse = "+"))
      )  
      if (verbose) {
        message("=== Estimating parameters for all cell types in the experiment\n")
        message(paste("=== Creating cell model matrix based on", 
                      paste(cell.cov.columns, collapse = ", "),
                      "and", cell.type.column, "columns:"))
        message("\t", formula.cell.model, "\n")
      }
    }
    # subset of cells
    if (!is.null(subset.cells)) {
      list.data <- .reduceDataset(
        subset.cells = subset.cells,
        list.data = list.data,
        cell.type.column = cell.type.column, 
        cell.ID.column = cell.ID.column, 
        gene.ID.column = gene.ID.column,
        proportional = proportional,
        verbose = verbose
      )  
    }
    sdm <- model.matrix(
      formula.cell.model,
      data = list.data[[2]][match(colnames(list.data[[1]]),
                                  list.data[[2]][, cell.ID.column]), , 
                            drop = FALSE]
    )
    sdm.ncol <- ncol(sdm)
    sdm.colnames <- colnames(sdm)
  } else {
    if (!all(set.type %in% unique(list.data[[2]][, cell.type.column])))
      stop("Cell type(s) provided in 'set.type' argument not found")
    # check that set.type contains more than two cell types
    if (length(set.type) <= 2) {
      stop("'set.type' must contain more than two different cell types in ", 
           "order to estimate the parameters of the model") 
    }
    # formula with only specific cell types
    if (missing(cell.cov.columns) || is.null(cell.cov.columns)) {
      formula.cell.model <- as.formula(
        paste("~", paste(cell.type.column, collapse = "+"))
      )  
      if (verbose) {
        message("=== Estimating parameters for ", paste(set.type, collapse = ", "), 
                " cell type(s) from the experiment\n")
        message(paste("=== Creating cell model matrix based on", 
                      cell.type.column, "columns:"))
        message("\t", formula.cell.model, "\n")
      }
    } else {
      formula.cell.model <- as.formula(
        paste("~", paste(c(cell.cov.columns, cell.type.column), collapse = "+"))
      )  
      if (verbose) {
        message("=== Estimating parameters for ", paste(set.type, collapse = ", "), 
                " cell type(s) from the experiment\n")
        message(paste("=== Creating cell model matrix based on", 
                      paste(cell.cov.columns, collapse = ", "),
                      "and", cell.type.column, "columns:"))
        message("\t", formula.cell.model, "\n")
      }
    }
    sel.cells <- list.data[[2]][, cell.type.column] %in% set.type
    cell.IDs <- list.data[[2]][sel.cells, cell.ID.column]
    list.data[[2]] <- list.data[[2]][sel.cells, , drop = FALSE]
    list.data[[1]] <- list.data[[1]][, cell.IDs, drop = FALSE]
    # subset of cells
    if (!is.null(subset.cells)) {
      list.data <- .reduceDataset(
        subset.cells = subset.cells,
        list.data = list.data,
        cell.type.column = cell.type.column, 
        cell.ID.column = cell.ID.column, 
        gene.ID.column = gene.ID.column,
        proportional = proportional,
        verbose = verbose
      )  
    }
    sdm <- model.matrix(
      formula.cell.model,
      data = list.data[[2]][match(colnames(list.data[[1]]),
                                  list.data[[2]][, cell.ID.column]), , 
                            drop = FALSE]
    )
    sdm.ncol <- ncol(sdm)
    sdm.colnames <- colnames(sdm)
    # sdm <- NULL
    # sdm.ncol <- 1
    # sdm.colnames <- seq(1)
  }
  # covariates for genes
  if (missing(gene.cov.columns) || is.null(gene.cov.columns)) {
    if (verbose) 
      message("=== Creating gene model matrix without gene covariates\n")
    gdm <- model.matrix(
      ~ 1, data = list.data[[3]][match(rownames(list.data[[1]]),
                                       list.data[[3]][, gene.ID.column]), ,
                                 drop = FALSE]
    )
  } else {
    formula.gene.model <- as.formula(
      paste("~", paste(gene.cov.columns, collapse = "+"))
    )
    if (verbose) {
      message("=== Creating gene model matrix with ", 
              gene.cov.columns, " covariate(s)")
      message("\t", formula.gene.model, "\n")
    }
    gdm <- model.matrix(formula.gene.model,
                        data = list.data[[3]][match(
                          rownames(list.data[[1]]), 
                          list.data[[3]][, gene.ID.column]), , drop = FALSE])
  }
  rownames(gdm) <- rownames(list.data[[1]])
  if (verbose) {
    start_time <- Sys.time()
    message("=== Running estimation process ",
            paste("(Start time", format(start_time, "%X)"), "\n"))
  }
  zinbParamsObj <- .zinbWaveModel(
    matrix.counts = ceiling(as.matrix(list.data[[1]])), 
    BPPARAM = parallelEnv, 
    design.samples = sdm, 
    design.genes = gdm, 
    O_mu = matrix(
      0, nrow = ncol(list.data[[1]]), ncol = nrow(list.data[[1]]), 
      dimnames = list(rownames = seq(ncol(list.data[[1]])),
                      colnames = rownames(list.data[[1]]))
    ), 
    O_pi = matrix(
      0, nrow = ncol(list.data[[1]]), ncol = nrow(list.data[[1]]), 
      dimnames = list(rownames = seq(ncol(list.data[[1]])),
                      colnames = rownames(list.data[[1]]))
    ), 
    beta_mu = matrix(
      0, nrow = sdm.ncol, ncol = nrow(list.data[[1]]), 
      dimnames = list(rownames = sdm.colnames, 
                      colnames = rownames(list.data[[1]]))
    ), 
    beta_pi = matrix(
      0, nrow = sdm.ncol, ncol = nrow(list.data[[1]]), 
      dimnames = list(rownames = sdm.colnames,
                      colnames = rownames(list.data[[1]]))
    ), 
    alpha_mu = matrix(
      0, nrow = 0, ncol = nrow(list.data[[1]]), 
      dimnames = list(rownames = NULL, 
                      colnames = rownames(list.data[[1]]))
    ), 
    alpha_pi = matrix(
      0, nrow = 0, ncol = nrow(list.data[[1]]), 
      dimnames = list(rownames = NULL,
                      colnames = rownames(list.data[[1]]))
    ), 
    verbose = verbose
  )
  # update slots
  zinb.params(object) <- ZinbParametersModel(zinbwave.model = zinbParamsObj)
  if (verbose) {
    message("\nDONE")
    end_time <- Sys.time()
    message("\nInvested time: ", round(end_time - start_time, 2))
  }
  return(object)
}
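
# Illustrative-only sketch (not part of the package API): how the ZINB-WaVE
# parameters stored by estimateZinbwaveParams() can be inspected afterwards.
# It assumes 'object' is a DigitalDLSorter object already processed by the
# function above; the same accessors are used later by simSCProfiles().
.sketchInspectZinbParams <- function(object) {
  model <- zinb.params(object)@zinbwave.model    # fitted ZinbModel object
  list(
    mu    = zinbwave::getMu(model),    # cells x genes matrix of NB means
    pi    = zinbwave::getPi(model),    # cells x genes matrix of dropout probabilities
    theta = zinbwave::getTheta(model)  # per-gene NB dispersion
  )
}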

.zinbWaveModel <- function(
  matrix.counts, 
  design.samples = NULL,
  design.genes = NULL, 
  common.disp = TRUE,
  iter.init = 2, 
  iter.opt = 25, 
  stop.opt = 1e-04,
  verbose = TRUE,
  BPPARAM = NULL, 
  ...
) {
  if (verbose) message("=== Removing genes without expression in any cell")
  matrix.counts <- matrix.counts[rowSums(matrix.counts) > 0, ]
  args.list <- list(
    Y = matrix.counts,
    commondispersion = common.disp,
    verbose = verbose,
    nb.repeat.initialize = iter.init,
    maxiter.optimize = iter.opt,
    stop.epsilon.optimize = stop.opt,
    BPPARAM = BPPARAM
  )
  if (!is.null(design.samples)) {
    args.list$X <- design.samples
  }
  if (!is.null(design.genes)) {
    args.list$V <- design.genes
  }
  args.list <- c(args.list, list(...))
  
  if (verbose) {message(">>> Fitting ZINB-WaVE model")}
  model <- do.call(zinbwave::zinbFit, args.list)
  return(model)
}
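
# Illustrative-only sketch of the zinbwave::zinbFit() call that .zinbWaveModel()
# assembles through do.call(). 'counts' (genes x cells) and 'cells.metadata'
# (one row per cell, with a hypothetical "Cell_Type" column of >= 2 levels)
# are assumed inputs; they are not part of the package API.
.sketchZinbFit <- function(counts, cells.metadata) {
  counts <- counts[rowSums(counts) > 0, ]        # drop genes with no expression
  X <- stats::model.matrix(~ Cell_Type, data = cells.metadata)  # cell-level design
  V <- matrix(                                   # intercept-only gene-level design
    1, nrow = nrow(counts),
    dimnames = list(rownames(counts), "(Intercept)")
  )
  zinbwave::zinbFit(
    Y = counts, X = X, V = V,
    commondispersion = TRUE,
    BPPARAM = BiocParallel::SerialParam()
  )
}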

.reduceDataset <- function(
  subset.cells,
  list.data,
  cell.type.column, 
  cell.ID.column, 
  gene.ID.column,
  proportional,
  verbose
) {
  sub.list.data <- .subsetCells(
    counts = list.data[[1]], 
    cells.metadata = list.data[[2]], 
    cell.type.column = cell.type.column, 
    cell.ID.column = cell.ID.column, 
    total.subset = subset.cells,
    proportional = proportional,
    verbose = verbose
  )
  if (any(Matrix::rowSums(sub.list.data[[1]]) == 0)) {
    warning("There are some genes with zero expression in selected cells. ", 
            "Consider increasing the minimum expression level when loading ", 
            "data by loadSCProfiles function with 'min.counts' and ", 
            "'min.cells' arguments\n", call. = FALSE, immediate. = TRUE)
    list.data.filf <- .filterGenesSparse(
      counts = sub.list.data[[1]], 
      genes.metadata = list.data[[3]], 
      gene.ID.column = gene.ID.column, 
      min.counts = 0, 
      min.cells = 0, 
      verbose = verbose
    )
    list.data[[1]] <- list.data.filf[[1]]
    list.data[[3]] <- list.data.filf[[2]]
  } else {
    list.data[[1]] <- sub.list.data[[1]]
  }
  list.data[[2]] <- sub.list.data[[2]]
  return(list.data)
}

# helper to check whether a given slot accessor of the object returns a value
.checkSlot <- function(object, slot) {
  ss <- eval(parse(text = paste0(slot, "(",
                   deparse(substitute(object)), ")")))
  return(!is.null(ss))
}

.checkNumCellTypes <- function(
  num.cells, 
  total.subset,
  cells.metadata, 
  cell.type.column
) {
  num.cells.dataset <- table(cells.metadata[, cell.type.column])
  prop.final <- ifelse(
    test = num.cells.dataset < num.cells, 
    yes = num.cells.dataset, no = num.cells
  )
  available.cells <- num.cells.dataset - prop.final
  dif <- total.subset - sum(prop.final)
  if (sum(available.cells) >= dif) {
    while (dif != 0) {
      if (length(available.cells[available.cells > 0]) == 1) {
        pos <- available.cells[available.cells > 0]  
      } else {
        pos <- sample(available.cells[available.cells > 0], 1) 
      }
      if (dif - pos < 0) {
        name.pos <- names(pos)
        pos <- dif
        names(pos) <- name.pos
      }
      prop.final[names(pos)] <- prop.final[names(pos)] + pos
      available.cells <- num.cells.dataset - prop.final
      dif <- total.subset - sum(prop.final)
    }
    # avoid zero cell types: only works if there are enough cells
    if (any(prop.final == 0)) {
      pos <- which(prop.final == 0)
      prop.final[pos] <- prop.final[pos] + 1
      exc <- length(pos)
      while (exc != 0) {
        i <- sample(length(prop.final), 1)
        if (prop.final[i] > exc) {
          prop.final[i] <- prop.final[i] - exc
          exc <- 0
        } else if (prop.final[i] > 1) {
          prop.final[i] <- prop.final[i] - 1
          exc <- exc - 1
        }
      }
    }
  } else {
    message("Subsetting of cells is not available for this dataset")
  }
  return(prop.final)
}
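
# Illustrative-only sketch with hypothetical toy data: how .checkNumCellTypes()
# redistributes a requested per-type allocation when a cell type cannot supply
# the cells asked for (here, type "C" only has 5 cells, so the missing 15 are
# reassigned at random to types with spare cells while keeping the total at 60).
.sketchCheckNumCellTypes <- function() {
  cells.metadata <- data.frame(
    Cell_Type = rep(c("A", "B", "C"), times = c(50, 30, 5))
  )
  requested <- c(A = 20, B = 20, C = 20)
  .checkNumCellTypes(
    num.cells = requested, total.subset = 60,
    cells.metadata = cells.metadata, cell.type.column = "Cell_Type"
  )
}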

# subset of data proportional or not to the original cell type proportions
.subsetCells <- function(
  counts, 
  cells.metadata, 
  cell.type.column, 
  cell.ID.column, 
  total.subset,
  proportional,
  verbose
) {
  if (ncol(counts) <= total.subset) {
    stop("'subset.cells' must be less than the total number of cells")
  } else if (total.subset < length(unique(cells.metadata[, cell.type.column]))) {
    stop("'subset.cells' must be greater than the number of cell types")
  }
  if (!proportional) {
    n.cell.types <- length(unique(cells.metadata[, cell.type.column]))
    proportions <- rep(100 / n.cell.types, n.cell.types)
  } else {
    proportions <- table(cells.metadata[, cell.type.column]) /
      length(cells.metadata[, cell.type.column]) * 100
  }
  # set number of cells for each cell type to upper limit
  nums.cells <- .setHundredLimit(
    x = ceiling((total.subset * proportions) / 100),
    limit = total.subset
  )
  names(nums.cells) <- levels(factor(cells.metadata[, cell.type.column]))
  # check if numbers are possible: number of cells of each cell type
  nums.cells <- .checkNumCellTypes(
    num.cells = nums.cells, 
    total.subset = total.subset, 
    cells.metadata = cells.metadata,
    cell.type.column = cell.type.column
  )
  if (any(nums.cells == 0))
    stop("Some cell types have zero cells. Please, provide a greater 'total.subset'")
  
  if (verbose) {
    message(
      "=== Number of cells for each cell type:\n",
      paste0("    - ", names(nums.cells), ": ", nums.cells, collapse = "\n"), 
      "\n"
    )
  }
  # random sampling of cells
  pos <- sapply(
    X = names(nums.cells), 
    FUN = function(x) {
      # sample positions explicitly: sample(x, size) would draw from seq(x)
      # if which() returned a single index
      index <- which(cells.metadata[, cell.type.column] == x)
      index[sample.int(length(index), size = nums.cells[x])]
    }
  ) %>% unlist(use.names = FALSE) %>% sort()
  
  counts <- counts[, pos]
  if (!is(counts, "dgCMatrix")) 
    counts <- Matrix::Matrix(as.matrix(counts), sparse = TRUE)
  
  return(list(counts, cells.metadata[pos, ]))
}


.checkHDF5parameters <- function(
  file.backend, 
  name.dataset.backend,
  compression.level
) {
  if (file.exists(file.backend)) {
    if (is.null(name.dataset.backend)) {
      warning("'file.backend' already exists. Using a random dataset name",
              call. = FALSE, immediate. = TRUE)
      name.dataset.backend <- HDF5Array::getHDF5DumpName(for.use = TRUE)
      while (strsplit(name.dataset.backend, 
                      split = "/")[[1]][2] %in% 
             rhdf5::h5ls(file.backend)[, "name"]) {
        name.dataset.backend <- .randomStr()
      }
    } else if (name.dataset.backend %in% rhdf5::h5ls(file.backend)[, "name"]) {
      stop("'file.backend' and 'name.dataset.backend' already exist. Please, ", 
           "introduce a correct file path or other dataset name")
    }
  } else {
    name.dataset.backend <- HDF5Array::getHDF5DumpName(for.use = TRUE)
  }
  # check if compression.level is valid
  if (is.null(compression.level)) {
    compression.level <- HDF5Array::getHDF5DumpCompressionLevel()
  } else {
    if (compression.level < 0 || compression.level > 9) {
      stop("'compression.level' must be an integer between 0 (no compression) ", 
           "and 9 (highest compression and slowest reading/writing). ")
    }
  }
  return(list(name.dataset.backend, compression.level))
}
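
# Illustrative-only sketch (hypothetical file/dataset names and inputs): the
# rhdf5 pattern used below by simSCProfiles() when block.processing = TRUE,
# namely creating an extendable integer dataset, growing it, and writing each
# block of simulated cells at its column positions. 'blocks' is assumed to be
# a list of genes x cells matrices sharing the same rows.
.sketchWriteBlocksHDF5 <- function(blocks, file = "simulated.h5", dataset = "counts") {
  n.genes <- nrow(blocks[[1]])
  n.cells <- sum(vapply(blocks, ncol, integer(1)))
  rhdf5::h5createFile(file)
  rhdf5::h5createDataset(
    file = file, dataset = dataset,
    dims = dim(blocks[[1]]), maxdims = c(n.genes, n.cells),
    chunk = c(n.genes, 1), storage.mode = "integer"
  )
  offset <- 0
  for (block in blocks) {
    # extend the dataset to its final size, then write at the right columns
    rhdf5::h5set_extent(file = file, dataset = dataset, dims = c(n.genes, n.cells))
    rhdf5::h5write(
      obj = block, file = file, name = dataset,
      index = list(seq(n.genes), offset + seq(ncol(block)))
    )
    offset <- offset + ncol(block)
  }
  rhdf5::H5close()
  HDF5Array::HDF5Array(filepath = file, name = dataset)
}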


################################################################################
######################## Simulate single-cell profiles #########################
################################################################################

#' Simulate new single-cell RNA-Seq expression profiles using the ZINB-WaVE
#' model parameters
#'
#' Simulate single-cell expression profiles by randomly sampling from a negative
#' binomial distribution and inserting dropouts by sampling from a binomial
#' distribution using the ZINB-WaVE parameters estimated by the
#' \code{\link{estimateZinbwaveParams}} function.
#'
#' Before this step, see \code{?\link{estimateZinbwaveParams}}. As described in
#' Torroja and Sanchez-Cabo, 2019, this function simulates a given number of
#' transcriptional profiles for each cell type provided by randomly sampling
#' from a negative binomial distribution with the estimated \eqn{\mu} and
#' \eqn{\theta} parameters, and inserting dropouts by sampling from a binomial
#' distribution with probability \eqn{\pi}. All parameters are estimated from
#' single-cell real data using the \code{\link{estimateZinbwaveParams}}
#' function. It uses the ZINB-WaVE model (Risso et al., 2018). For more details
#' about the model, see \code{?\link{estimateZinbwaveParams}} and Risso et al.,
#' 2018.
#'
#' The \code{file.backend} argument allows creating an HDF5 file with the
#' simulated single-cell profiles to be used as back-end, so that data are
#' stored on disk instead of being loaded into RAM. If the \code{file.backend}
#' argument is used with \code{block.processing = FALSE}, all the single-cell
#' profiles will be simulated in one step (and therefore loaded into RAM) and
#' then written to the HDF5 file. To avoid exhausting RAM when too many
#' single-cell profiles are simulated, they can be simulated and written to the
#' HDF5 file in blocks of \code{block.size} cells by setting
#' \code{block.processing = TRUE}.
#'
#' @param object \code{\linkS4class{DigitalDLSorter}} object with
#'   \code{single.cell.real} and \code{zinb.params} slots.
#' @param cell.ID.column Name or column number corresponding to the cell names
#'   of expression matrix in cells metadata.
#' @param cell.type.column Name or column number corresponding to the cell type
#'   of each cell in cells metadata.
#' @param n.cells Number of simulated cells generated per cell type (e.g. with
#'   10 different cell types in your dataset and \code{n.cells = 100}, 1000
#'   cell profiles will be simulated).
#' @param suffix.names Suffix used on simulated cells. This suffix must be
#'   unique in the simulated cells, so make sure that this suffix does not
#'   appear in the real cell names.
#' @param cell.types Vector indicating the cell types to simulate. If
#'   \code{NULL} (by default), \code{n.cells} single-cell profiles for all cell
#'   types will be simulated.
#' @param file.backend Valid file path to store the simulated single-cell
#'   expression profiles as an HDF5 file (\code{NULL} by default). If provided,
#'   the data is stored in HDF5 files used as back-end by using the
#'   \pkg{DelayedArray}, \pkg{HDF5Array} and \pkg{rhdf5} packages instead of
#'   loading all data into RAM memory. This is suitable for situations where you
#'   have large amounts of data that cannot be loaded into memory. Note that
#'   operations on these data will be performed in blocks (i.e. subsets of a
#'   given size), which may result in longer execution times.
#' @param name.dataset.backend Name of the dataset to be used in the HDF5 file.
#'   Note that the dataset must not already exist in the file. If \code{NULL}
#'   (by default), a random dataset name will be used.
#' @param compression.level The compression level used if \code{file.backend} is
#'   provided. It is an integer value between 0 (no compression) and 9 (highest
#'   and slowest compression). See
#'   \code{?\link[HDF5Array]{getHDF5DumpCompressionLevel}} from the
#'   \pkg{HDF5Array} package for more information.
#' @param block.processing Boolean indicating whether the data should be
#'   simulated in blocks (only if \code{file.backend} is used, \code{FALSE} by
#'   default). This functionality is suitable for cases where it is not
#'   possible to load all the data into memory, although it leads to longer
#'   execution times.
#' @param block.size Only if \code{block.processing = TRUE}. Number of
#'   single-cell expression profiles that will be simulated in each iteration
#'   during the process. Larger numbers result in higher memory usage but
#'   shorter execution times. Set according to available computational resources
#'   (1000 by default). Note that it cannot be greater than the total number of
#'   simulated cells.
#' @param chunk.dims Specifies the dimensions of the HDF5 chunks. If
#'   \code{NULL}, the default value is a vector of two items: the number of
#'   genes considered by the ZINB-WaVE model during the simulation and a single
#'   sample in order to reduce read times in the following steps. A larger
#'   number of columns written in each chunk can lead to longer read times in
#'   subsequent steps. Note that it cannot be greater than the dimensions of the
#'   simulated matrix.
#' @param verbose Show informative messages during the execution (\code{TRUE} by
#'   default).
#'
#' @return A \code{\linkS4class{DigitalDLSorter}} object with
#'   \code{single.cell.simul} slot containing a
#'   \code{\linkS4class{SingleCellExperiment}} object with the simulated
#'   single-cell expression profiles.
#'
#' @export
#'
#' @seealso \code{\link{estimateZinbwaveParams}}
#'
#' @examples 
#' set.seed(123) # reproducibility
#' sce <- SingleCellExperiment::SingleCellExperiment(
#'   assays = list(
#'     counts = matrix(
#'       rpois(30, lambda = 5), nrow = 15, ncol = 10, 
#'       dimnames = list(paste0("Gene", seq(15)), paste0("RHC", seq(10)))
#'     )
#'   ),
#'   colData = data.frame(
#'     Cell_ID = paste0("RHC", seq(10)),
#'     Cell_Type = sample(x = paste0("CellType", seq(2)), size = 10, 
#'                        replace = TRUE)
#'   ),
#'   rowData = data.frame(
#'     Gene_ID = paste0("Gene", seq(15))
#'   )
#' )
#' DDLS <- loadSCProfiles(
#'   single.cell.data = sce,
#'   cell.ID.column = "Cell_ID",
#'   gene.ID.column = "Gene_ID"
#' )
#' DDLS <- estimateZinbwaveParams(
#'   object = DDLS,
#'   cell.type.column = "Cell_Type",
#'   cell.ID.column = "Cell_ID",
#'   gene.ID.column = "Gene_ID", 
#'   subset.cells = 4,
#'   verbose = FALSE
#' )
#' DDLS <- simSCProfiles(
#'   object = DDLS,
#'   cell.ID.column = "Cell_ID",
#'   cell.type.column = "Cell_Type",
#'   n.cells = 2,
#'   verbose = TRUE
#' )
#'
#' @references Risso, D., Perraudeau, F., Gribkova, S. et al. (2018). A general
#'   and flexible method for signal extraction from single-cell RNA-seq data.
#'   Nat Commun 9, 284. doi: \doi{10.1038/s41467-017-02554-5}.
#'
#'   Torroja, C. and Sánchez-Cabo, F. (2019). digitalDLSorter: A Deep Learning
#'   algorithm to quantify immune cell populations based on scRNA-Seq data.
#'   Frontiers in Genetics 10, 978. doi: \doi{10.3389/fgene.2019.00978}.
#'   
simSCProfiles <- function(
  object,
  cell.ID.column,
  cell.type.column,
  n.cells,
  suffix.names = "_Simul",
  cell.types = NULL,
  file.backend = NULL,
  name.dataset.backend = NULL,
  compression.level = NULL,
  block.processing = FALSE,
  block.size = 1000,
  chunk.dims = NULL,
  verbose = TRUE
) {
  if (!is(object, "DigitalDLSorter")) {
    stop("Object provided is not of DigitalDLSorter class")
  }
  if (is.null(zinb.params(object))) {
    stop("'zinb.params' slot is empty. To simulate single-cell profiles, ",
         "DigitalDLSorter object must contain estimated parameters ",
         "for data by estimateZinbwaveParams function")
  }
  if (is.null(single.cell.real(object))) {
    stop("'single.cell.real' slot is empty. To simulate single-cell ", 
         "profiles, DigitalDLSorter object must contain the original ", 
         "data. See ?loadSCProfiles")
  }
  if (!is.null(single.cell.simul(object = object))) {
    warning("'single.cell.simul' slot already has a SingleCellExperiment 
            object. Note that it will be overwritten\n", 
            call. = FALSE, immediate. = TRUE)
  }
  if (missing(cell.ID.column) || is.null(cell.ID.column)) {
    stop("'cell.ID.column' argument is needed. Please, see ",
         "?simSCProfiles")
  } else if (missing(cell.type.column) || is.null(cell.type.column)) {
    stop("'cell.type.column' argument is needed. Please, see ?simSCProfiles")
  } else if (missing(n.cells) || is.null(n.cells)) {
    stop("'n.cells' argument is needed. Please, see ?simSCProfiles")
  } else if (!is.numeric(n.cells)) {
    stop("'n.cells' argument must be a integer. Please, see ?simSCProfiles")
  } else if (n.cells <= 0) {
    stop("'n.cells' must be greater than 0")
  }
  # check if parameters related to hdf5 file are correct
  if (!is.null(file.backend)) {
    if (!requireNamespace("DelayedArray", quietly = TRUE) || 
        !requireNamespace("HDF5Array", quietly = TRUE)) {
      stop("digitalDLSorteR provides the possibility of using HDF5 files as back-end
         when data are too big to be located in RAM. It uses DelayedArray, 
         HDF5Array and rhdf5 to do it. Please Please install both packages to 
         use this functionality")
    } 
    hdf5Params <- .checkHDF5parameters(
      file.backend = file.backend, 
      name.dataset.backend = name.dataset.backend, 
      compression.level = compression.level
    )
    name.dataset.backend <- hdf5Params[[1]]
    compression.level <- hdf5Params[[2]]
  }
  # extract data and model
  list.data <- .extractDataFromSCE(
    SCEobject = single.cell.real(object),
    cell.ID.column = cell.ID.column,
    new.data = FALSE
  )
  zinb.object <- zinb.params(object)@zinbwave.model
  # check if cell.ID.column and cell.type.column are correct
  mapply(
    function(x, y) {
      .checkColumn(
        metadata = list.data[[2]],
        ID.column = x,
        type.metadata = "cells.metadata",
        arg = y
      )
    },
    c(cell.ID.column, cell.type.column),
    c("'cell.ID.column'", "'cell.type.column'")
  )
  # for cases where estimation was performed on only a subset of cell types
  cell.types.used <- list.data[[2]][rownames(zinb.object@X), 
                                    cell.type.column] %>% unique()
  # generate metadata for simulated cells
  colnames(list.data[[1]]) <- paste(
    list.data[[2]][, cell.type.column], 
    list.data[[2]][, cell.ID.column], sep = "_"
  ) 
  list.data[[2]]$simCellName <- paste(
    list.data[[2]][, cell.type.column],
    list.data[[2]][, cell.ID.column], sep = "_"
  )
  list.data[[2]]$Simulated <- FALSE
  # cell types in model
  cell.set.names <- NULL
  model.cell.types <- grep(
    pattern = cell.type.column,
    x = colnames(zinb.object@X),
    value = T
  )
  cell.type.names <- sub(
    pattern = cell.type.column,
    replacement = "",
    x = model.cell.types
  )
  if (!is.null(cell.types)) {
    if (!all(cell.types %in% cell.types.used)) {
      stop("Cell type(s) provided in 'cell.types' not found in ZINB-WaVE model.",
           "\n  Only cell types that have been used during estimation of ",
           "parameters can be simulated")
    }
    cell.sel <- cell.type.names %in% cell.types
    cell.type.names <- cell.type.names[cell.sel]
    model.cell.types <- model.cell.types[cell.sel]
  }
  names(cell.type.names) <- model.cell.types

  for (s in model.cell.types) {
    cell.type.name <- cell.type.names[s]
    cell.index <- rownames(zinb.object@X)[which(zinb.object@X[, s] == 1)]
    nams <- sample(cell.index, size = n.cells, replace = T)
    if (is.null(cell.set.names)) {
      cell.set.names <- nams
      names(cell.set.names) <- paste(
        cell.type.name, suffix.names, seq(from = 1, to = n.cells), sep = ""
      )
    } else {
      ns <- names(cell.set.names)
      cell.set.names <- c(cell.set.names, nams)
      names(cell.set.names) <- c(
        ns, paste(
          cell.type.name, suffix.names, 
          seq(from = length(ns) + 1, to = length(ns) + n.cells), sep = ""
        )
      )
    }
  }
  intercept.celltype <- FALSE
  if (!is.null(cell.types)) {
    inter.cell.type <- setdiff(cell.types, cell.type.names)
    if (length(inter.cell.type) != 0) intercept.celltype <- TRUE
  } else {
    inter.cell.type <- setdiff(cell.types.used, cell.type.names)
    intercept.celltype <- TRUE
  }
  if (intercept.celltype) {
    # the intercept (reference) cell type is the one whose cell type dummy
    # columns are all 0 (row sums across those columns equal 0)
    cell.index <- rownames(zinb.object@X)[rowSums(
      zinb.object@X[, grep(cell.type.column, 
                                 colnames(zinb.object@X), 
                                 value = T), drop = FALSE]
    ) == 0]
    nams <- sample(cell.index, size = n.cells, replace = T)
    ns <- names(cell.set.names)
    cell.set.names <- c(cell.set.names, nams)
    names(cell.set.names) <- c(
      ns, paste(inter.cell.type, suffix.names,
                seq(from = length(ns) + 1,
                    to = length(ns) + n.cells),
                sep = "")
    )
  }
  # getting parameters from zinb-wave model
  mu <- zinbwave::getMu(zinb.object) # rows are cells
  pi <- zinbwave::getPi(zinb.object) # rows are cells
  theta <- zinbwave::getTheta(zinb.object) # for genes
  # setting dimensions of simulated matrix
  n <- length(cell.set.names) # as.numeric(nCells)
  J <- zinbwave::nFeatures(zinb.object)
  if (verbose) {
    # message about parameters
    message("=== Getting parameters from model:")
    message("    - mu: ", paste(dim(mu), collapse = ", "))
    message("    - pi: ", paste(dim(pi), collapse = ", "))
    message("    - Theta: ", length(theta), "\n")
    # message about selected cell types
    message(paste0("=== Selected cell type(s) from ZINB-WaVE model (", 
                   length(c(cell.type.names, inter.cell.type)), 
                   " cell type(s)):"))
    message(paste0("    - ", c(cell.type.names, inter.cell.type), 
                   collapse = "\n"), "\n")
    # messages about simulated matrix
    message("=== Simulated matrix dimensions:")
    message("    - n (cells): ", n)
    message("    - J (genes): ", J)
    message("    - i (# entries): ", n * J)
  }
  if (block.processing && is.null(file.backend)) {
    stop("'block.processing' is only compatible with the use of HDF5 files ", 
         "as back-end ('file.backend' argument)")
  } else if (block.processing && !is.null(file.backend)) {
    if (!requireNamespace("DelayedArray", quietly = TRUE) || 
        !requireNamespace("HDF5Array", quietly = TRUE)) {
      stop("digitalDLSorteR provides the possibility of using HDF5 files as back-end
         when data are too big to be located in RAM. It uses DelayedArray, 
         HDF5Array and rhdf5 to do it. Please install both packages to 
         use this functionality")
    } 
    if (n < block.size) {
      block.size <- n
      warning("The number of simulated cells is less than 'block.size'. ",
              "Only one block will be performed", 
              call. = FALSE, immediate. = TRUE)
    }
    if (verbose) 
      message("\n=== Simulating and writing new single-cell profiles by blocks")
    if (is.null(chunk.dims)){
      chunk.dims <- c(J, 1)
    } else {
      if (any(chunk.dims > c(J, n))) {
        warning("'chunk.dims' must be equal to or less than dimension of ", 
                "data. Setting default value", call. = FALSE, immediate. = TRUE)
        chunk.dims <- c(J, 1)
      }
    }
    if (!file.exists(file.backend)) rhdf5::h5createFile(file = file.backend) 
    rhdf5::h5createDataset(
      file = file.backend, dataset = name.dataset.backend, 
      dims = c(J, block.size), maxdims = c(J, n), 
      chunk = chunk.dims, storage.mode = "integer"
    )
    # pointers for hdf5 file and zinb-wave parameters
    r.i <- 0
    r.j <- 0
    # iteration over cells 
    for (iter in seq(ceiling(n / block.size))) {
      if ((block.size * iter) - n > 0) { # && dif < block.size
        dif <- (block.size * iter) - n
        block.size <- block.size - dif
      } else {
        dif <- block.size
      }
      sub.i <- seq(from = r.i + 1, to = r.i + block.size)
      sub.j <- seq(from = r.j + 1, to = r.j + block.size * J)
      r.i <- r.i + block.size
      r.j <- r.j + block.size
      # per-gene dispersion: the mu matrix (block.size x J) is flattened in
      # column-major order, so the gene index is ceiling(position / block.size)
      datanb <- rnbinom(length(sub.j), mu = mu[cell.set.names[sub.i], ], 
                        size = theta[ceiling(seq_along(sub.j) / block.size)])
      data.nb <- matrix(datanb, nrow = block.size)
      datado <- rbinom(length(sub.j), size = 1, prob = pi[cell.set.names[sub.i], ])
      data.dropout <- matrix(datado, nrow = block.size)
      sim.counts <- t(data.nb * (1 - data.dropout))
      if (iter == 1) {
        rhdf5::h5write(
          obj = sim.counts, file = file.backend, 
          name = name.dataset.backend, level = compression.level
        )
      } else {
        # extend the dataset to its final size before writing this block
        rhdf5::h5set_extent(
          file = file.backend, dataset = name.dataset.backend, dims = c(J, n)
        )
        rhdf5::h5write(
          obj = sim.counts, 
          file = file.backend, name = name.dataset.backend, 
          # write the block at the column positions of its cells (sub.i)
          index = list(seq(J), sub.i),
          level = compression.level
        )
      }
      if (verbose) message("    - Block ", iter, " written")
    }
    rhdf5::H5close()
    # HDF5Array object for SingleCellExperiment class
    sim.counts <- HDF5Array::HDF5Array(
      file = file.backend, 
      name = name.dataset.backend
    )
    dimnames(sim.counts) <- list(rownames(zinb.object@V), 
                                 names(cell.set.names))
  } else if (!block.processing) {
    i <- seq(n * J)
    mu <- mu[cell.set.names, ]
    pi <- pi[cell.set.names, ]
    datanb <- rnbinom(n * J, mu = mu[i], size = theta[ceiling(i/n)])
    data.nb <- matrix(datanb, nrow = n)
    
    datado <- rbinom(length(i), size = 1, prob = pi[i])
    data.dropout <- matrix(datado, nrow = n)
    
    sim.counts <- t(data.nb * (1 - data.dropout))
    sim.counts <- Matrix::Matrix(
      sim.counts, dimnames = list(
        rownames = rownames(zinb.object@V),
        colnames = names(cell.set.names))
    )  
  }
  sim.cells.metadata <- list.data[[2]][cell.set.names, ]
  sim.cells.metadata$simCellName <- NULL # it is not used
  sim.cells.metadata[, cell.ID.column] <- names(cell.set.names)
  rownames(sim.cells.metadata) <- names(cell.set.names)
  sim.cells.metadata$Simulated <- TRUE
  if (any(colnames(sim.cells.metadata) == "suffix"))
    warning("\n'suffix' column in cells metadata is going to be overwritten")
  
  sim.cells.metadata$suffix <- suffix.names
  if (any(apply(sim.counts, MARGIN = 2, FUN = sum) == 0)) {
    warning(
      paste(
        "Some of the simulated cells have a library size equal to zero.", 
        "This may be because the original library size of these cells is", 
        "too low. They will be removed for further analysis, so check your", 
        "initial parameters when loading scRNA-seq data"
      )
    )
    pos.cells <- which(apply(sim.counts, MARGIN = 2, FUN = sum) != 0)
    sim.counts <- sim.counts[, pos.cells]
    sim.cells.metadata <- sim.cells.metadata[pos.cells, ]
  }
  
  sim.sce <- .createSCEObject(
    counts = sim.counts,
    cells.metadata = sim.cells.metadata,
    genes.metadata = list.data[[3]][rownames(sim.counts), ],
    file.backend = file.backend,
    name.dataset.backend = name.dataset.backend,
    compression.level = compression.level,
    chunk.dims = chunk.dims,
    block.processing = block.processing,
    verbose = verbose
  )
  single.cell.simul(object) <- sim.sce
  if (verbose) message("\nDONE")
  return(object)
}
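
# Illustrative-only sketch (not exported, hypothetical helper): the core ZINB
# sampling that simSCProfiles() applies to every simulated cell, written out
# for a single cell of the fitted model. 'zinb.object' is assumed to be the
# ZinbModel stored in the zinb.params slot and 'cell' one of its cell names
# (or a row index).
.sketchSimulateOneCell <- function(zinb.object, cell) {
  mu    <- zinbwave::getMu(zinb.object)[cell, ]   # NB mean per gene
  pi    <- zinbwave::getPi(zinb.object)[cell, ]   # dropout probability per gene
  theta <- zinbwave::getTheta(zinb.object)        # NB dispersion per gene
  counts  <- stats::rnbinom(length(mu), mu = mu, size = theta)
  dropout <- stats::rbinom(length(pi), size = 1, prob = pi)
  counts * (1 - dropout)  # zero-inflated counts for this cell
}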
