immunedeconv2: Second generation methods for immune cell deconvolution

Documented in build_model_cpm calculate_cell_embedding deconvolute_cpm

#' Placeholder model builder for CPM
#'
#' CPM does not work with a signature matrix, so there is no model to build. This function only
#' emits a message pointing to the deconvolute method, which takes the single-cell and bulk
#' RNA-seq data directly.
#'
#' @return NULL.
#'
#' @export
build_model_cpm <- function() {
  # CPM has no separate training step; tell the caller where to go instead.
  hint <- c(
    "The deconvolution with CPM is done in only one step. Please just use the ",
    "deconvolute method."
  )
  do.call(message, as.list(hint))

  NULL
}

#' CPM Deconvolution
#'
#' This function calculates the CPM deconvolution proportions.
#' IMPORTANT: No model is needed. Everything is done inside this method.
#' This method is NOT deterministic, so if it is run multiple times, it will create different
#' outputs.
#'
#' This function initiates the Cellular Population Mapping (CPM) algorithm - a deconvolution
#' algorithm in which single-cell genomics is required in only one or a few samples, where in other
#' samples of the same tissue, only bulk genomics is measured and the underlying fine resolution
#' cellular heterogeneity is inferred.
#' CPM predicts the abundance of cells (and cell types) ranging monotonically from negative to
#' positive levels. Using a relative framework these values correspond to decrease and increase in
#' cell abundance levels, respectively. On the other hand, in an absolute framework lower values
#' (including negatives) correspond to lower abundances and vice versa. These values are comparable
#' between samples.
#'
#'
#' @param bulk_gene_expression A matrix of bulk data. Rows are genes, columns are samples.
#'   Row and column names need to be set. At least two samples (columns) are required.
#' @param single_cell_object A matrix with the single-cell data. Rows are genes, columns are
#'   samples. Row and column names need to be set.
#' @param cell_type_annotations A vector of the cell type annotations. Has to be in the same order
#'   as the samples in single_cell_object.
#' @param cell_space The cell state space corresponding to the single-cell RNA-seq data. It can be
#'   a vector for a 1-dim space or a 2D matrix for a 2-dim space where each column represents a
#'   different dimension. The cell space should incorporate the similarities of cells within cell
#'   types. Similarities between cells from different cell types, based on the cell space, are not
#'   taken into account in CPM.
#'   It is also possible to supply the string "PCA", "UMAP" or "TSNE" which calculates the cell
#'   space using the corresponding method (using the Seurat implementation).
#' @param no_cores A number for the amount of cores which will be used for the analysis. The
#'   default (NULL) is total number of cores minus 1. Values above 125 (requested explicitly or
#'   resulting from the default) are reduced to 125.
#' @param neighborhood_size Cell neighborhood size which will be used for the analysis. This should
#'   be lower than the number of cells in the smallest cell type. The default is 10.
#' @param model_size The reference subset size in each iteration of CPM. This should be lower than
#'   the total number of cells. The default is 50.
#' @param min_selection The minimum number of times in which each reference cell is selected.
#'   Increasing this value might have a large effect on the algorithm's running time.
#'   The default is 5.
#' @param calculate_CI A boolean parameter indicating whether the calculation of confidence
#'   intervals is needed. The default is FALSE.
#' @param verbose Whether to produce an output on the console. Currently this only controls the
#'   message emitted when the number of cores is reduced.
#' @return A list including:
#' \item{predicted}{CPM predicted cell abundance matrix. Each row represents a sample and
#'   each column a single cell.}
#' \item{cellTypePredictions}{CPM predicted cell-type abundance matrix. Each row represents a sample
#'   and each column a single cell-type.}
#' \item{confIntervals}{A matrix containing the confidence interval for each cell and sample. Each
#'   row represents a sample and each column a single cell. This is calculated
#'   if calculate_CI = TRUE.}
#' \item{numOfRuns}{The number of deconvolution repeats performed by CPM. }
#' @export
deconvolute_cpm <- function(bulk_gene_expression, single_cell_object, cell_type_annotations,
                            cell_space = "PCA", no_cores = NULL, neighborhood_size = 10,
                            model_size = 50, min_selection = 5, calculate_CI = FALSE,
                            verbose = FALSE) {
  # missing() is tested before is.null() so that an omitted argument produces the same readable
  # error as an explicit NULL instead of R's generic "argument is missing, with no default".
  if (missing(bulk_gene_expression) || is.null(bulk_gene_expression)) {
    stop("Parameter 'bulk_gene_expression' is missing or null, but it is required.")
  }
  if (missing(single_cell_object) || is.null(single_cell_object)) {
    stop("Parameter 'single_cell_object' is missing or null, but it is required.")
  }
  if (missing(cell_type_annotations) || is.null(cell_type_annotations)) {
    stop("Parameter 'cell_type_annotations' is missing or null, but it is required.")
  }

  if (ncol(bulk_gene_expression) < 2) {
    stop("CPM requires at least two bulk samples.")
  }

  # A single string names an embedding method; anything else is taken as a ready-made cell space.
  if (is.character(cell_space) && length(cell_space) == 1) {
    cell_space <- calculate_cell_embedding(single_cell_object, cell_type_annotations, cell_space)
  }

  # Upper bound on usable workers (presumably a consequence of R's limit on open connections).
  max_cores <- 125
  # Only intervene when the effective core count would exceed the bound: either the caller asked
  # for too many, or the caller left the default and the machine itself is that large. An explicit,
  # smaller request is passed through untouched. isTRUE() guards against detectCores() being NA.
  requested_cores <- if (is.null(no_cores)) parallel::detectCores() else no_cores
  if (isTRUE(requested_cores > max_cores)) {
    if (verbose) {
      message("Reduced the used cores to 125 because R only supports 125")
    }
    no_cores <- max_cores
  }

  scBio::CPM(single_cell_object, cell_type_annotations, bulk_gene_expression, cell_space,
    no_cores = no_cores, neighborhoodSize = neighborhood_size, modelSize = model_size,
    minSelection = min_selection, quantifyTypes = TRUE, typeTransformation = TRUE,
    calculateCI = calculate_CI
  )
}




#' Calculation of the cell_space parameter needed by CPM
#'
#' Builds a Seurat object from the single-cell matrix, runs normalization, scaling, variable
#' feature selection and PCA, and returns the embedding requested via \code{method}. UMAP and
#' TSNE are computed on top of the PCA result.
#'
#' @param single_cell_object A matrix with the single-cell data. Rows are genes, columns are
#'   samples. Row and column names need to be set.
#' @param cell_type_annotations A vector of the cell type annotations. Has to be in the same order
#'   as the samples in single_cell_object.
#' @param method Either "PCA", "UMAP" or "TSNE" (case-insensitive). If several values are
#'   supplied, the first one is used and a message is emitted.
#'
#' @return A matrix with two dimensions. The rows are the cells, the columns are the two dimensions
#'  calculated by Seurat
calculate_cell_embedding <- function(single_cell_object, cell_type_annotations,
                                     method = c("PCA", "UMAP", "TSNE")) {
  if (length(method) > 1) {
    method <- method[1]
    message(paste0(method, " was chosen because multiple values were supplied for \"method\""))
  }
  method <- tolower(method)

  # Validate before doing any work: the Seurat pipeline below is expensive, so an unknown
  # (or empty / NA) method must fail here rather than after PCA has already been computed.
  if (length(method) != 1 || !method %in% c("pca", "umap", "tsne")) {
    stop("Method ", method, " not recognized")
  }

  sce <- matrix_to_singlecellexperiment(single_cell_object, cell_type_annotations)
  seurat <- Seurat::as.Seurat(sce, counts = "X", data = NULL)

  seurat <- Seurat::NormalizeData(seurat)
  seurat <- Seurat::ScaleData(seurat, features = rownames(seurat))
  seurat <- Seurat::FindVariableFeatures(seurat)
  # PCA is always required: it is either the result itself or the input to UMAP / TSNE.
  seurat <- Seurat::RunPCA(seurat, features = Seurat::VariableFeatures(object = seurat))

  switch(method,
    # Only the first two principal components are used as the cell space.
    pca = seurat@reductions$pca@cell.embeddings[, 1:2],
    umap = {
      seurat <- Seurat::RunUMAP(seurat, dims = 1:10)
      seurat@reductions$umap@cell.embeddings
    },
    tsne = {
      seurat <- Seurat::RunTSNE(seurat)
      seurat@reductions$tsne@cell.embeddings
    }
  )
}