R/simulator.R

Defines functions merge_scaling_factor merge_simulations plot_simulation save_simulation cpm_normalize calc_scaling_vector simulate_bulk simulate_sample

Documented in calc_scaling_vector merge_scaling_factor merge_simulations plot_simulation save_simulation simulate_bulk simulate_sample

#' SimBu: Bias-aware simulation of bulk RNA-seq data with variable cell type composition
#'
#' As complex tissues are typically composed of various cell types, deconvolution tools have been developed to computationally
#' infer their cellular composition from bulk RNA sequencing (RNA-seq) data. To comprehensively assess deconvolution performance,
#' gold-standard datasets are indispensable. The simulation of ‘pseudo-bulk’ data, generated by aggregating single-cell RNA-seq (scRNA-seq)
#' expression profiles in pre-defined proportions, offers a scalable and cost-effective way of generating these gold-standard datasets.
#' SimBu was developed to simulate pseudo-bulk samples based on various simulation scenarios, designed to test specific features of deconvolution
#' methods. A unique feature of SimBu is the modelling of cell-type-specific mRNA bias using experimentally-derived or data-driven scaling factors.
#'
#' @section Dataset generation:
#' You will need an annotated scRNA-seq dataset (as matrix file, h5ad file, Seurat object), which is the baseline for the simulations.
#' Use the dataset_* functions to generate a SummarizedExperiment, that holds all important information.
#' It is also possible to access scRNA-seq datasets through the public database Sfaira, by using the functions dataset_sfaira() and
#' dataset_sfaira_multiple().
#'
#' @section Simulation:
#' Use the simulate_bulk() function to generate multiple pseudo-bulk samples, which will be returned as a SummarizedExperiment. You can
#' adapt the cell type fractions in each sample by changing the scenario parameter.
#'
#' @section Visulaization:
#' Inspect the cell type composition of your simulations with the plot_simulation() function.
#'
#' @docType package
#' @name SimBu
NULL
#> NULL

#' simulate single pseudo-bulk sample
#'
#' function to sample cells according to given cell-type fractions. This creates a single pseudo-bulk sample by calculating the
#' mean expression value per gene over all sampled cells.
#' Note: if total_read_counts is used, the cell-fractions are applied to the number of counts, not the number of cells!
#' @param data \link[SummarizedExperiment]{SummarizedExperiment} object
#' @param scaling_vector vector with scaling values for each cell; calculated by the \code{calc_scaling_vector} function
#' @param simulation_vector named vector with wanted cell-types and their fractions
#' @param total_cells numeric; number of total cells for this simulation
#' @param total_read_counts numeric; sets the total read count value for each sample
#' @param remove_bias_in_counts boolean; if TRUE (default) the internal mRNA bias that is present in count data will be *removed* using the number of reads mapped to each cell
#' @param remove_bias_in_counts_method 'read-number' (default) or 'gene-number'; method with which the mRNA bias in counts will be removed
#' @param norm_counts boolean; if TRUE the samples simulated with counts will be normalized to CPMs, default is FALSE
#' @param seed numeric; fix this value if you want the same cells to be sampled
#'
#' @return returns two vectors (one based on counts, one based on tpm; depends on which matrices are present in data) with expression values for all genes in the provided dataset
#'
#' @keywords internal
simulate_sample <- function(data,
                            scaling_vector,
                            simulation_vector,
                            total_cells,
                            total_read_counts,
                            remove_bias_in_counts,
                            remove_bias_in_counts_method,
                            norm_counts,
                            seed) {
  if (!all(names(simulation_vector) %in% unique(SummarizedExperiment::colData(data)[["cell_type"]]))) {
    stop("Some cell-types in the provided simulation vector are not in the annotation.")
  }

  if (!all.equal(sum(simulation_vector), 1)) {
    em <- paste0("The sum of the cell-type fractions can not be larger than 1. The sum currently is: ", sum(simulation_vector))
    stop(em)
  }

  # loop over all wanted cell-types and sample to have the final amount
  sampled_cells <- lapply(seq_along(simulation_vector), function(x) {
    # get all cells with the current type
    cells_of_type_x <- data.frame(SummarizedExperiment::colData(data)[SummarizedExperiment::colData(data)[["cell_type"]] == names(simulation_vector[x]), ])

    if (simulation_vector[x] == 0) {
      cells <- c()
    } else {
      # how many cells of this type do we need?
      n <- round(total_cells * simulation_vector[x])
      if (n == 0) {
        n <- 1
      }
      if(!is.na(seed)){
        # fix seed for random selection of cells
        set.seed(seed)
      }
      cells <- dplyr::slice_sample(cells_of_type_x, n = n, replace = TRUE)
      cells <- cells[["cell_ID"]]
    }

    return(cells)
  })

  # annotation
  # names(sampled_cells) <- names(simulation_vector)
  # simulated_annotation <- utils::stack(sampled_cells)

  # get the corresponding columns from the data
  data_sampled <- data[, unlist(sampled_cells)]

  # remove mRNA bias from count data by dividing count matrix by nReads or nGenes per cell
  if (remove_bias_in_counts) {
    if (remove_bias_in_counts_method == "read-number") {
      remove_scaling_vector <- data_sampled[["nReads_SimBu"]] # access meta data in SummarizedExperiment directly like this
    } else if (remove_bias_in_counts_method == "gene-number") {
      remove_scaling_vector <- data_sampled[["nGenes_SimBu"]]
    } else {
      stop('If you want to remove the internal mRNA bias from counts, please specify the method you want to use with remove_bias_in_counts_method. Possible are "read-number" or "gene-number".')
    }
    # replace count matrix in SummarizedExperiment
    count_matrix_removed_bias <- Matrix::t(Matrix::t(SummarizedExperiment::assays(data_sampled)[["counts"]]) / remove_scaling_vector)
    SummarizedExperiment::assays(data_sampled)[["counts"]] <- count_matrix_removed_bias
  }

  # apply scaling vector on the sampled cells in the matrices
  scaling_vector <- scaling_vector[unlist(sampled_cells)]

  simulated_count_vector <- NULL
  simulated_tpm_vector <- NULL

  # apply scaling factor
  # & sum up counts/tpms of all cells per gene to get single bulk sample
  # & apply possible normalization of sample

  simulated_count_vector <- Matrix::rowSums(Matrix::t(Matrix::t(SummarizedExperiment::assays(data_sampled)[["counts"]]) * scaling_vector))

  # if total_read_counts is used, rarefy/scale count data to get the desired sequencing depth
  if (!is.null(total_read_counts)) {
    sum_counts <- sum(simulated_count_vector)

    if (sum_counts > total_read_counts) {
      tmp_phylo <- phyloseq::otu_table(data.frame(simulated_count_vector), taxa_are_rows = TRUE)
      tmp_vec <- data.frame(phyloseq::rarefy_even_depth(physeq = tmp_phylo, sample.size = total_read_counts, trimOTUs = FALSE, ))[, 1]
      names(tmp_vec) <- names(simulated_count_vector)
      simulated_count_vector <- tmp_vec
    } else if (sum_counts < total_read_counts) {
      simulated_count_vector <- simulated_count_vector / sum(simulated_count_vector) * total_read_counts
    }
  }

  # normalize sample based on counts to CPMs
  if (norm_counts) {
    simulated_count_vector <- (simulated_count_vector / sum(simulated_count_vector)) * 1e6
  }

  # only generate simulation based on TPM if tpm assay is present
  if ("tpm" %in% names(SummarizedExperiment::assays(data))) {
    simulated_tpm_vector <- Matrix::rowSums(Matrix::t(Matrix::t(SummarizedExperiment::assays(data_sampled)[["tpm"]]) * scaling_vector))
    # normalize tpm based sample to 1e6
    simulated_tpm_vector <- (simulated_tpm_vector / sum(simulated_tpm_vector)) * 1e6
  }

  return(list(
    simulated_count_vector = simulated_count_vector,
    simulated_tpm_vector = simulated_tpm_vector
  ))
}


#' Simulate whole pseudo-bulk RNAseq dataset
#'
#' This function allows you to create a full pseudo-bulk RNAseq dataset. You need to provide a \link[SummarizedExperiment]{SummarizedExperiment} from which the cells
#' will be sampled for the simulation. Also a \code{scenario} has to be selected, where you can choose how the cells will be sampled and a
#' \code{scaling_factor} on how the read counts will be transformed proir to the simulation.
#'
#' @param data (mandatory) \link[SummarizedExperiment]{SummarizedExperiment} object
#' @param scenario (mandatory) select on of the pre-defined cell-type fraction scenarios; possible are: \code{even},\code{random},\code{mirror_db},\code{pure},\code{weighted}; you can also use the \code{custom} scenario, where you need to set the \code{custom_scenario_data} parameter.
#' @param scaling_factor (mandatory) name of scaling factor; possible are: \code{census}, \code{spike_in}, \code{read_number}, \code{expressed_genes}, \code{custom}, \code{epic}, \code{abis}, \code{quantiseq} or \code{NONE} for no scaling factor
#' @param scaling_factor_single_cell boolean: decide if a scaling value for each single cell is calculated (default) or the median of all scaling values for each cell type is calculated
#' @param weighted_cell_type name of cell-type used for \code{weighted} scenario
#' @param weighted_amount fraction of cell-type used for \code{weighted} scenario; must be between \code{0} and \code{0.99}
#' @param pure_cell_type name of cell-type for \code{pure} scenario
#' @param custom_scenario_data dataframe; needs to be of size \code{nsamples} x number_of_cell_types, where each sample is a row and each entry is the cell-type fraction. Rows need to sum up to 1.
#' @param custom_scaling_vector named vector with custom scaling values for cell-types. Cell-types that do not occur in this vector but are present in the dataset will be set to 1; mandatory for \code{custom} scaling factor
#' @param balance_even_mirror_scenario balancing value for the \code{uniform} and \code{mirror_db} scenarios: increasing it will result in more diverse simulated fractions. To get the same fractions in each sample, set to 0. Default is 0.01.
#' @param remove_bias_in_counts boolean; if TRUE the internal mRNA bias that is present in count data will be *removed* using the number of reads mapped to each cell. Default to FALSE
#' @param remove_bias_in_counts_method 'read-number' (default) or 'gene-number'; method with which the mRNA bias in counts will be removed
#' @param norm_counts boolean; if TRUE the samples simulated with counts will be normalized to CPMs, default is FALSE
#' @param nsamples numeric; number of samples in pseudo-bulk RNAseq dataset (default = 100)
#' @param ncells numeric; number of cells in each dataset (default = 1000)
#' @param total_read_counts numeric; sets the total read count value for each sample
#' @param whitelist list; give a list of cell-types you want to keep for the simulation; if NULL, all are used
#' @param blacklist list; give a list of cell-types you want to remove for the simulation; if NULL, all are used; is applied after whitelist
#' @param seed numeric; specifiy a seed for RNG. This effects cell sampling; with a fixed seed you will always sample the same cells for each sample (seed value is incrased by 1 for each sample). Default = NA (two simulation runs will sample different cells).
#' @param BPPARAM BiocParallel::bpparam() by default; if specific number of threads x want to be used, insert: BiocParallel::MulticoreParam(workers = x)
#' @param run_parallel boolean, decide if multi-threaded calculation will be run. FALSE by default
#'
#'
#' @return named list; \code{bulk} a \link[SummarizedExperiment]{SummarizedExperiment} object, where the assays store the simulated bulk RNAseq datasets. Can hold either one or two assays, depending on how many matrices were present in the dataset
#' \code{cell-fractions} is a dataframe with the simulated cell-fractions per sample;
#' \code{scaling_vector} scaling value for each cell in dataset
#' @export
#'
#' @examples
#' # generate sample single-cell data to work with:
#'
#' counts <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::t(1e6 * Matrix::t(tpm) / Matrix::colSums(tpm))
#'
#' colnames(counts) <- paste0("cell_", rep(1:300))
#' colnames(tpm) <- paste0("cell_", rep(1:300))
#' rownames(counts) <- paste0("gene_", rep(1:1000))
#' rownames(tpm) <- paste0("gene_", rep(1:1000))
#'
#' annotation <- data.frame(
#'   "ID" = paste0("cell_", rep(1:300)),
#'   "cell_type" = c(
#'     rep("T cells CD4", 50),
#'     rep("T cells CD8", 50),
#'     rep("Macrophages", 100),
#'     rep("NK cells", 10),
#'     rep("B cells", 70),
#'     rep("Monocytes", 20)
#'   )
#' )
#'
#' dataset <- SimBu::dataset(
#'   annotation = annotation,
#'   count_matrix = counts,
#'   tpm_matrix = tpm,
#'   name = "test_dataset"
#' )
#'
#' # this creates a basic pseudo-bulk dataset with uniform cell-type distribution
#' # and no additional transformation of the data with 10 samples and 2000 cells each
#'
#' s <- SimBu::simulate_bulk(dataset,
#'   scenario = "even",
#'   scaling_factor = "NONE",
#'   nsamples = 10,
#'   ncells = 100
#' )
#'
#' # use a blacklist to exclude certain cell-types for the simulation
#' s <- SimBu::simulate_bulk(dataset,
#'   scenario = "even",
#'   scaling_factor = "NONE",
#'   nsamples = 10,
#'   ncells = 2000,
#'   blacklist = c("Monocytes", "Macrophages")
#' )
#'
#'
#' # use the pure scenario to only have B cells
#' s <- SimBu::simulate_bulk(dataset,
#'   scenario = "pure",
#'   scaling_factor = "NONE",
#'   nsamples = 10,
#'   ncells = 100,
#'   pure_cell_type = "B cells"
#' )
#'
#' # simulate a dataset with custom cell-type fraction for each of the 3 samples
#' fractions <- data.frame(
#'   "B cells" = c(0.2, 0.4, 0.2),
#'   "T cells CD4" = c(0.4, 0.2, 0.1),
#'   "Macrophages" = c(0.4, 0.4, 0.7), check.names = FALSE
#' )
#' s <- SimBu::simulate_bulk(dataset,
#'   scenario = "custom",
#'   scaling_factor = "NONE",
#'   nsamples = 3,
#'   ncells = 2000,
#'   custom_scenario_data = fractions
#' )
#'
simulate_bulk <- function(data,
                          scenario = c("even", "random", "mirror_db", "weighted", "pure", "custom"),
                          scaling_factor = c("NONE", "census", "spike_in", "custom", "read_number", "expressed_genes", "annotation_column", "epic", "abis", "quantiseq"),
                          scaling_factor_single_cell = TRUE,
                          weighted_cell_type = NULL,
                          weighted_amount = NULL,
                          pure_cell_type = NULL,
                          custom_scenario_data = NULL,
                          custom_scaling_vector = NULL,
                          balance_even_mirror_scenario = 0.01,
                          remove_bias_in_counts = FALSE,
                          remove_bias_in_counts_method = "read-number",
                          norm_counts = FALSE,
                          nsamples = 100,
                          ncells = 1000,
                          total_read_counts = NULL,
                          whitelist = NULL,
                          blacklist = NULL,
                          seed = NA,
                          BPPARAM = BiocParallel::bpparam(),
                          run_parallel = FALSE) {
  # switch multi-threading on/off
  if (!run_parallel) {
    BPPARAM <- BiocParallel::MulticoreParam(workers = 1)
  } else {
    message("Using parallel generation of simulations.")
  }

  # keep only cell-types which are in whitelist in annotation & count matrix
  if (!is.null(whitelist)) {
    if (!all(whitelist %in% SummarizedExperiment::colData(data)[["cell_type"]])) {
      stop("Did not find all cell-types of whitelist in annotation.")
    }
    data <- data[, SummarizedExperiment::colData(data)[SummarizedExperiment::colData(data)[["cell_type"]] %in% whitelist, ][["cell_ID"]]]
    if (dim(data)[2] == 0) {
      stop("No cells are left after using this whitelist; please check that the correct names are used.")
    }
  }

  # remove cell-types which are in blacklist from annotation & count matrix
  if (!is.null(blacklist)) {
    if (!all(blacklist %in% SummarizedExperiment::colData(data)[["cell_type"]])) {
      stop("Did not find all cell-types of blacklist in annotation.")
    }
    data <- data[, SummarizedExperiment::colData(data)[!SummarizedExperiment::colData(data)[["cell_type"]] %in% blacklist, ][["cell_ID"]]]
    if (dim(data)[2] == 0) {
      stop("No cells are left after using this blacklist; please check that the correct names are used.")
    }
  }

  ##### different cell-type scenarios #####

  # each existing cell-type will be appearing in equal amounts
  if (scenario == "even") {
    all_types <- unique(SummarizedExperiment::colData(data)[["cell_type"]])
    n_cell_types <- length(all_types)
    simulation_vector_list <- lapply(seq_len(nsamples), function(x) {
      m <- round(matrix(abs(stats::rnorm(length(all_types), mean = 1 / length(all_types), sd = balance_even_mirror_scenario)), ncol = n_cell_types), 3)
      m <- sweep(m, 1, rowSums(m), FUN = "/")
      simulation_vector <- as.vector(m[1, ])
      names(simulation_vector) <- all_types
      return(simulation_vector)
    })
    # give each sample a name
    sample_names <- paste0("even_sample", seq_len(nsamples))
    names(simulation_vector_list) <- sample_names
  }
  # generate random cell-type fractions (depending on appearance in database)
  if (scenario == "random") {
    # generate 'nsamples' random samples
    simulation_vector_list <- lapply(seq_len(nsamples), function(x) {
      n_cell_types <- length(unique(SummarizedExperiment::colData(data)[["cell_type"]]))
      # generate n_cell_type amount of random fractions from the uniform distribution, which will sum up to 1
      m <- round(matrix(stats::runif(n_cell_types, 0, 1), ncol = n_cell_types), 3)
      m <- sweep(m, 1, rowSums(m), FUN = "/")
      simulation_vector <- as.vector(m[1, ])
      names(simulation_vector) <- unique(SummarizedExperiment::colData(data)[["cell_type"]])
      return(simulation_vector)
    })
    sample_names <- paste0("random_sample", seq_len(nsamples))
    names(simulation_vector_list) <- sample_names
  }
  # generate cell-type fractions, which mirror the fraction of each cell-type in the used dataset
  if (scenario == "mirror_db") {
    n_cell_types <- length(unique(SummarizedExperiment::colData(data)[["cell_type"]]))
    # generate 'nsamples' random samples
    simulation_vector_list <- lapply(seq_len(nsamples), function(x) {
      # each cell-type will be represented as many times as it occurs in the used dataset
      mirror_values <- table(SummarizedExperiment::colData(data)[["cell_type"]]) / ncol(data)
      m <- unlist(lapply(mirror_values, function(y) {
        return(abs(round(stats::rnorm(1, mean = y, sd = balance_even_mirror_scenario), 3)))
      }))
      m <- matrix(m, ncol = n_cell_types)
      m <- sweep(m, 1, rowSums(m), FUN = "/")
      simulation_vector <- as.vector(m[1, ])
      names(simulation_vector) <- names(mirror_values)
      return(simulation_vector)
    })
    sample_names <- paste0("mirror_db_sample", seq_len(nsamples))
    names(simulation_vector_list) <- sample_names
  }
  # one cell-type will be highly over represented, the others are random
  if (scenario == "weighted") {
    if (is.null(weighted_cell_type) || is.null(weighted_amount)) {
      stop("The weighted scenario requires you to select one cell-type which will be over represented")
    }
    if (weighted_amount > 0.99 || weighted_amount < 0) {
      stop("The weighted cell-type fraction needs to be between 0 and 0.99.")
    }
    if (!weighted_cell_type %in% unique(SummarizedExperiment::colData(data)[["cell_type"]])) {
      stop("The weighted cell-type could not be found in your dataset.")
    }

    random_cell_types <- setdiff(unique(SummarizedExperiment::colData(data)[["cell_type"]]), weighted_cell_type)
    all_cell_types <- c(random_cell_types, weighted_cell_type)

    simulation_vector_list <- lapply(seq_len(nsamples), function(x) {
      n_cell_types <- length(unique(SummarizedExperiment::colData(data)[["cell_type"]])) - 1
      # generate n_cell_type amount of random fractions from the uniform distribution, which will sum up to 1
      m <- matrix(stats::runif(n_cell_types, 0, 1), ncol = n_cell_types)
      simulation_vector <- as.vector(m[1, ])
      simulation_vector <- (1 - weighted_amount) * simulation_vector / sum(simulation_vector)
      simulation_vector <- append(simulation_vector, weighted_amount)
      names(simulation_vector) <- all_cell_types
      return(simulation_vector)
    })
    sample_names <- paste0("weighted_sample", seq_len(nsamples))
    names(simulation_vector_list) <- sample_names
  }
  # pure: only simulate a single cell-type
  if (scenario == "pure") {
    if (is.null(pure_cell_type)) {
      stop("The pure scenario requires you to select a cell-type which will be simulated")
    }
    simulation_vector_list <- lapply(seq_len(nsamples), function(x) {
      simulation_vector <- c(1)
      names(simulation_vector) <- as.character(pure_cell_type)
      return(simulation_vector)
    })
    sample_names <- paste0("pure_sample", seq_len(nsamples))
    names(simulation_vector_list) <- sample_names
  }
  # custom fractions
  if (scenario == "custom") {
    if (is.null(custom_scenario_data)) {
      stop("You need to provide a dataframe with you custom cell-type fractions in this scenario with the custom_scenario_data parameter.")
    }
    # check dimensions and cell-types in dataframe; samples are rows
    if (nrow(custom_scenario_data) != nsamples) {
      stop("The scenario data has a differnt amount of samples than you want to simulate.")
    }
    if (!all(colnames(custom_scenario_data) %in% unique(SummarizedExperiment::colData(data)[["cell_type"]]))) {
      stop("Could not find all cell-types from scenario data in annotation.")
    }
    simulation_vector_list <- as.list(as.data.frame(t(custom_scenario_data)))
    simulation_vector_list <- lapply(simulation_vector_list, function(x) {
      names(x) <- colnames(custom_scenario_data)
      x <- x
    })
    sample_names <- paste0("custom_sample", seq_len(nsamples))
    names(simulation_vector_list) <- sample_names
  }

  ##### pre-calculate scaling factors ####

  scaling_vector <- calc_scaling_vector(
    data = data,
    scaling_factor = scaling_factor,
    custom_scaling_vector = custom_scaling_vector,
    scaling_factor_single_cell = scaling_factor_single_cell,
    BPPARAM = BPPARAM,
    run_parallel = run_parallel
  )

  ##### generate the samples #####

  # sample cells and generate pseudo-bulk profiles
  all_samples <- BiocParallel::bplapply(seq_along(simulation_vector_list), function(i) {
    
    simulation_vector <- simulation_vector_list[[i]]
    sample <- simulate_sample(
      data = data,
      scaling_vector = scaling_vector,
      simulation_vector = simulation_vector,
      total_cells = ncells,
      total_read_counts = total_read_counts,
      remove_bias_in_counts = remove_bias_in_counts,
      remove_bias_in_counts_method = remove_bias_in_counts_method,
      norm_counts = norm_counts,
      seed = ifelse(is.na(seed), seed, seed + i) # ensure that samples still are different if seed is set
    )

    return(sample)
  }, BPPARAM = BPPARAM)

  bulk_counts <- Matrix::Matrix(vapply(
    X = all_samples,
    FUN = "[[",
    ... = 1,
    FUN.VALUE = double(dim(SummarizedExperiment::assays(data)[["counts"]])[1])
  ),
  sparse = TRUE
  )
  if ("tpm" %in% names(SummarizedExperiment::assays(data))) {
    bulk_tpm <- Matrix::Matrix(vapply(
      X = all_samples,
      FUN = "[[",
      ... = 2,
      FUN.VALUE = double(dim(SummarizedExperiment::assays(data)[["tpm"]])[1])
    ),
    sparse = TRUE
    )
    assays <- list(bulk_counts = bulk_counts, bulk_tpm = bulk_tpm)
  } else {
    assays <- list(bulk_counts = bulk_counts)
  }


  # new SummarizedExperiment with bulk datasets based on counts or TPM or both
  se_bulk <- SummarizedExperiment::SummarizedExperiment(assays = assays)
  colnames(se_bulk) <- sample_names

  # remove non-unique features/genes from simulated dataset
  if (length(unique(rownames(se_bulk))) != dim(se_bulk)[1]) {
    un <- unique(rownames(se_bulk))
    se_bulk <- se_bulk[un, ]
  }

  # cell_fractions for all simulated samples
  cell_fractions <- data.frame(t(data.frame(simulation_vector_list)), check.names = FALSE)

  message("Finished simulation.")

  return(list(
    bulk = se_bulk,
    cell_fractions = cell_fractions,
    scaling_vector = scaling_vector
  ))
}


#' Calculate scaling factor for a dataset
#'
#' Each scaling factor has a default matrix it will try to use (counts or TPM). If the required matrix is not available, the other one is used and a warning is given.
#'
#' @param data dataset object
#' @param scaling_factor name of scaling factor; possible are: \code{census}, \code{spike_in}, \code{read_number}, \code{custom} or \code{NONE} for no scaling factor
#' @param custom_scaling_vector named vector with custom scaling values for cell-types. Cell-types that do not occur in this vector but are present in the dataset will be set to 1
#' @param scaling_factor_single_cell boolean: decide if a scaling value for each single cell is calculated (default) or the median of all scaling values for each cell type is calculated
#' @param BPPARAM BiocParallel::bpparam() by default; if specific number of threads x want to be used, insert: BiocParallel::MulticoreParam(workers = x)
#' @param run_parallel boolean, decide if multi-threaded calculation will be run. FALSE by default
#'
#' @return a named vector with a scaling value for each cell in the dataset
#'
#' @keywords internal
calc_scaling_vector <- function(data, scaling_factor, custom_scaling_vector, scaling_factor_single_cell, BPPARAM, run_parallel) {
  if (scaling_factor == "census") {
    if ("tpm" %in% names(SummarizedExperiment::assays(data))) {
      m <- SummarizedExperiment::assays(data)[["tpm"]]
    } else {
      warning("Scaling factor 'Census' requires TPM data, which is not available in the given dataset. Counts will be used instead.")
      m <- SummarizedExperiment::assays(data)[["counts"]]
    }
    scaling_vector <- census(m, expr_threshold = 0.1, exp_capture_rate = .25, BPPARAM = BPPARAM, run_parallel = run_parallel)
    scaling_vector[which(is.na(scaling_vector))] <- 1
    scaling_vector <- scaling_vector / 1e6
  } else if (scaling_factor == "spike_in") {
    # if you want to transform your counts by spike_in data, an additional column in the annotation table is needed
    # with name "spike_in"; the matrix counts will then be transformed accordingly
    if (!"spike_in" %in% colnames(SummarizedExperiment::colData(data))) {
      stop("No column with spike_in information in annotation data. Check your dataset again!")
    }

    # get subset of spike_in counts from the sampled cells
    anno_sub <- SummarizedExperiment::colData(data)[, c("cell_ID", "spike_in", "nReads_SimBu")]
    tot_counts <- anno_sub[["nReads_SimBu"]] + anno_sub[["spike_in"]] # these are all the reads mapped to transcripts *and* spike-ins
    scaling_vector <- anno_sub[["nReads_SimBu"]] / tot_counts # this is the fractions of reads mapped to 'true' transcripts
    names(scaling_vector) <- anno_sub[["cell_ID"]]
  } else if (scaling_factor == "read_number") {
    anno_sub <- SummarizedExperiment::colData(data)[, c("cell_ID", "nReads_SimBu")]
    scaling_vector <- anno_sub[["nReads_SimBu"]]
    names(scaling_vector) <- anno_sub[["cell_ID"]]
  } else if (scaling_factor == "expressed_genes") {
    anno_sub <- SummarizedExperiment::colData(data)[, c("cell_ID", "nGenes_SimBu")]
    scaling_vector <- anno_sub[["nGenes_SimBu"]]
    names(scaling_vector) <- anno_sub[["cell_ID"]]
  } else if (scaling_factor == "custom") {
    # needs vector with values for existing cell-types
    # cell-types that do not occur in this vector will have scaling-factor of 1
    if (is.null(custom_scaling_vector)) {
      stop("For the custom scaling factor you need to provide a custom_scaling_vector!")
    }
    message("Using custom scaling factors.")
    scaling_vector <- merge_scaling_factor(
      data = data,
      scaling_factor_values = custom_scaling_vector,
      scaling_factor_name = scaling_factor
    )
  } else if (scaling_factor == "NONE") {
    scaling_vector <- rep(1, ncol(data))
    names(scaling_vector) <- SummarizedExperiment::colData(data)[["cell_ID"]]
  } else if (scaling_factor == "epic") {
    message("Using EPIC scaling factors.")
    scaling_vector <- merge_scaling_factor(
      data = data,
      scaling_factor_values = pkg.globals$epic_scaling,
      scaling_factor_name = scaling_factor
    )
  } else if (scaling_factor == "abis") {
    message("Using ABIS scaling factors.")
    scaling_vector <- merge_scaling_factor(
      data = data,
      scaling_factor_values = pkg.globals$abis_scaling,
      scaling_factor_name = scaling_factor
    )
  } else if (scaling_factor == "quantiseq") {
    message("Using quanTIseq scaling factors.")
    scaling_vector <- merge_scaling_factor(
      data = data,
      scaling_factor_values = pkg.globals$qtsq_scaling,
      scaling_factor_name = scaling_factor
    )
  } else if (!is.null(scaling_factor)) {
    message("Scaling by custom column in annotation table; if no scaling is wished instead, use 'NONE'.")
    if (!scaling_factor %in% colnames(SummarizedExperiment::colData(data))) {
      em <- paste0("A column with the name ", scaling_factor, " cannot be found in the annotation.")
      stop(em)
    }
    if (!is.numeric(SummarizedExperiment::colData(data)[, c(scaling_factor)])) {
      em <- paste0("The column with the name ", scaling_factor, " is not numeric and cannot be used for scaling.")
      stop(em)
    }

    anno_sub <- SummarizedExperiment::colData(data)[, c("cell_ID", scaling_factor)]
    colnames(anno_sub) <- c("a", "b")
    scaling_vector <- anno_sub$b
    names(scaling_vector) <- anno_sub$a
  } else {
    warning("No valid scaling factor method provided. Scaling all cells by 1.")
    scaling_vector <- rep(1, ncol(data))
    names(scaling_vector) <- SummarizedExperiment::colData(data)[["cell_ID"]]
  }

  # calculate the median scaling value per cell-type
  if (!scaling_factor_single_cell) {
    df <- data.frame(
      value = scaling_vector,
      type = SummarizedExperiment::colData(data)[["cell_type"]],
      id = names(scaling_vector)
    )
    df$median_value <- stats::ave(df$value, df$type, FUN = stats::median)
    scaling_vector <- df[["median_value"]]
    names(scaling_vector) <- df[["id"]]
  }

  return(scaling_vector)
}


# normalize samples to one million -> TPM
cpm_normalize <- function(matrix) {
  m <- Matrix::t(1e6 * Matrix::t(matrix) / Matrix::colSums(matrix))
  if (sum(is.na(m)) > 0) {
    warning("There are NA entries in the normalized matrix.")
  }

  return(m)
}

#' Save the expression matrix of a simulated pseudo-bulk dataset to a file
#'
#' @param simulation the result of simulate_bulk()
#' @param filename the filename where to save the expression matrix to
#' @param assay name of the assay in simulation to save, default to bulk_counts
#'
#' @return write a file
#' @export
#'
#' @examples
#' counts <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::t(1e6 * Matrix::t(tpm) / Matrix::colSums(tpm))
#'
#' colnames(counts) <- paste0("cell_", rep(1:300))
#' colnames(tpm) <- paste0("cell_", rep(1:300))
#' rownames(counts) <- paste0("gene_", rep(1:1000))
#' rownames(tpm) <- paste0("gene_", rep(1:1000))
#'
#' annotation <- data.frame(
#'   "ID" = paste0("cell_", rep(1:300)),
#'   "cell_type" = c(
#'     rep("T cells CD4", 50),
#'     rep("T cells CD8", 50),
#'     rep("Macrophages", 100),
#'     rep("NK cells", 10),
#'     rep("B cells", 70),
#'     rep("Monocytes", 20)
#'   )
#' )
#'
#' dataset <- SimBu::dataset(
#'   annotation = annotation,
#'   count_matrix = counts,
#'   tpm_matrix = tpm,
#'   name = "test_dataset"
#' )
#'
#' s <- SimBu::simulate_bulk(dataset,
#'   scenario = "even",
#'   scaling_factor = "NONE",
#'   nsamples = 10,
#'   ncells = 100
#' )
#'
#' save_simulation(s, tempfile())
save_simulation <- function(simulation, filename, assay = "bulk_counts") {
  utils::write.table(as.matrix(SummarizedExperiment::assays(simulation$bulk)[[assay]]), filename, quote = FALSE, sep = "\t")
}


#' Plot the cell-type fractions in your simulated dataset
#'
#' @param simulation a simulation object generated by \code{simulate_bulk}
#'
#' @return a gpplot2 barplot
#' @export
#'
#' @examples
#' counts <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::t(1e6 * Matrix::t(tpm) / Matrix::colSums(tpm))
#'
#' colnames(counts) <- paste0("cell_", rep(1:300))
#' colnames(tpm) <- paste0("cell_", rep(1:300))
#' rownames(counts) <- paste0("gene_", rep(1:1000))
#' rownames(tpm) <- paste0("gene_", rep(1:1000))
#'
#' annotation <- data.frame(
#'   "ID" = paste0("cell_", rep(1:300)),
#'   "cell_type" = c(
#'     rep("T cells CD4", 50),
#'     rep("T cells CD8", 50),
#'     rep("Macrophages", 100),
#'     rep("NK cells", 10),
#'     rep("B cells", 70),
#'     rep("Monocytes", 20)
#'   )
#' )
#'
#' dataset <- SimBu::dataset(
#'   annotation = annotation,
#'   count_matrix = counts,
#'   tpm_matrix = tpm,
#'   name = "test_dataset"
#' )
#'
#' s <- SimBu::simulate_bulk(dataset,
#'   scenario = "even",
#'   scaling_factor = "NONE",
#'   nsamples = 10,
#'   ncells = 100
#' )
#'
#' SimBu::plot_simulation(s)
#'
plot_simulation <- function(simulation) {
  fractions <- simulation$cell_fractions
  fractions$sample <- factor(rownames(fractions), levels = rownames(fractions))
  frac_long <- tidyr::gather(fractions, "cell_type", "fraction", seq_len(length(fractions) - 1))

  ggplot2::ggplot(data = frac_long) +
    ggplot2::geom_col(ggplot2::aes_string(x = "fraction", y = "sample", fill = "cell_type")) +
    ggplot2::ggtitle(paste0("Cell-type fractions for \n", nrow(fractions), " pseudo-bulk RNAseq samples")) +
    ggplot2::scale_fill_manual(values = grDevices::colorRampPalette(RColorBrewer::brewer.pal(9, "Paired"))(length(unique(frac_long$cell_type)))) +
    ggplot2::theme_bw()
}


#' Combine multiple simulations into one result
#'
#' we recommend to only merge simulations from the same dataset object, otherwise the count matrices might not correspond on the gene level
#'
#' @param simulation_list a list of simulations
#'
#' @return named list; \code{bulk} a \link[SummarizedExperiment]{SummarizedExperiment} object, where the assays store the simulated bulk RNAseq datasets. Can hold either one or two assays, depending on how many matrices were present in the dataset
#' \code{cell-fractions} is a dataframe with the simulated cell-fractions per sample;
#' \code{scaling_vector} scaling value for each cell in dataset
#' @export
#'
#' @examples
#' counts <- Matrix::Matrix(matrix(rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::Matrix(matrix(rpois(3e5, 5), ncol = 300), sparse = TRUE)
#' tpm <- Matrix::t(1e6 * Matrix::t(tpm) / Matrix::colSums(tpm))
#'
#' colnames(counts) <- paste0("cell_", rep(1:300))
#' colnames(tpm) <- paste0("cell_", rep(1:300))
#' rownames(counts) <- paste0("gene_", rep(1:1000))
#' rownames(tpm) <- paste0("gene_", rep(1:1000))
#'
#' annotation <- data.frame(
#'   "ID" = paste0("cell_", rep(1:300)),
#'   "cell_type" = c(
#'     rep("T cells CD4", 50),
#'     rep("T cells CD8", 50),
#'     rep("Macrophages", 100),
#'     rep("NK cells", 10),
#'     rep("B cells", 70),
#'     rep("Monocytes", 20)
#'   )
#' )
#'
#' dataset <- SimBu::dataset(
#'   annotation = annotation,
#'   count_matrix = counts,
#'   tpm_matrix = tpm,
#'   name = "test_dataset"
#' )
#'
#' s1 <- SimBu::simulate_bulk(dataset,
#'   scenario = "even",
#'   scaling_factor = "NONE",
#'   nsamples = 10,
#'   ncells = 100
#' )
#'
#' s2 <- SimBu::simulate_bulk(dataset,
#'   scenario = "even",
#'   scaling_factor = "NONE",
#'   nsamples = 10,
#'   ncells = 100
#' )
#'
#' s <- SimBu::merge_simulations(list(s1, s2))
merge_simulations <- function(simulation_list) {
  # merge SummarizedExperiments
  if (length(simulation_list) <= 1) {
    stop("You need at least 2 simulations to merge them into one!")
  }

  n_assays <- unlist(lapply(simulation_list, function(x) {
    return(length(names(SummarizedExperiment::assays(x$bulk))))
  }))

  if (length(unique(n_assays)) != 1) {
    stop("The simulations you want to merge have different numbers of assays. Stopping.")
  }

  merged_se <- do.call(SummarizedExperiment::cbind, lapply(simulation_list, "[[", "bulk"))

  # merge cell fractions dataframe
  sample_names <- unlist(lapply(simulation_list, function(x) {
    rownames(x[["cell_fractions"]])
  }))
  sample_names <- paste0(sample_names, "_", seq_len(length(sample_names)))
  cell_fractions <- data.frame(data.table::rbindlist(lapply(simulation_list, function(x) {
    x[["cell_fractions"]]
  }), fill = TRUE, use.names = TRUE), check.names = FALSE)
  rownames(cell_fractions) <- sample_names
  cell_fractions[is.na(cell_fractions)] <- 0


  # list of scaling vectors
  scaling_vector <- lapply(
    X = simulation_list,
    FUN = "[[",
    ... = "scaling_vector"
  )
  names(scaling_vector) <- names(simulation_list)

  return(list(
    bulk = merged_se,
    cell_fractions = cell_fractions,
    scaling_vector = scaling_vector
  ))
}



#' Create scaling vector from custom or pre-defined scaling factor
#'
#' @param data dataset
#' @param scaling_factor_values named list of scaling values
#' @param scaling_factor_name name of scaling factor method
#'
#' @return scaling vector
#'
#' @keywords internal
merge_scaling_factor <- function(data, scaling_factor_values, scaling_factor_name) {
  # these are the name of cell types in dataset
  present_types <- unique(data[["cell_type"]])
  # names of cell types in dataset and scaling factor
  type_intersection <- Reduce(intersect, list(present_types, names(scaling_factor_values)))
  if (length(type_intersection) == 0) {
    em <- paste0("Could not apply ", scaling_factor_name, " scaling factor, because none of its cell types are present in the provided dataset.")
    stop(em)
  }
  # names of cell types which are in scaling factor, but not in dataset
  missing_types <- present_types[which(!present_types %in% names(scaling_factor_values))]
  if (length(missing_types) != 0) {
    wm <- paste0("For some cell type(s) in the dataset, no scaling factor is available when using ", scaling_factor_name, ": ", missing_types, ". This cell type will not be re-scaled.")
    warning(wm)
  }

  # missing types are not scaled -> value of 1
  complete_vector <- rep(1, length(missing_types))
  names(complete_vector) <- missing_types
  # combine with scaling values of factor, which correspond to cell types present in dataset
  complete_vector <- data.frame(value = append(complete_vector, scaling_factor_values[type_intersection]), check.names = FALSE)
  # give each cell the corresponding scaling value, depending on its type
  df <- merge(complete_vector, data.frame(SummarizedExperiment::colData(data)), by.x = 0, by.y = "cell_type", all.y = TRUE)[, c("value", "cell_ID")]
  scaling_vector <- df$value
  names(scaling_vector) <- df$cell_ID

  return(scaling_vector)
}
omnideconv/SimBu documentation built on May 5, 2024, 12:33 p.m.