R/EPIC_helper.R
In IOBR: Immune Oncology Biological Research

Documented in EPIC merge_duplicates scaleCounts

# ####################################'
#
# Package EPIC - developed by Julien Racle from David Gfeller's group of the
# University of Lausanne and Ludwig Institute for Cancer Research. Copyrights
# remain reserved as described in the included LICENSE file.
#
# ####################################'

#' Estimate the proportion of immune and cancer cells.
#'
#' \code{EPIC} takes as input bulk gene expression data (RNA-seq) and returns
#'  the proportion of mRNA and cells composing the various samples.
#'
#' This function uses a constrained least square minimization to estimate the
#' proportion of each cell type with a reference profile and another
#' uncharacterized cell type in bulk gene expression samples.
#'
#' The names of the genes in the bulk samples, the reference samples and in the
#' gene signature list need to be the same format (gene symbols are used in the
#' predefined reference profiles). The full list of gene names don't need to be
#' exactly the same between the reference and bulk samples: \emph{EPIC}
#' will use the intersection of the genes. In case of duplicate gene names,
#' \emph{EPIC} will use the median value per duplicate - if you want to consider
#' these cases differently, you can remove the duplicates before calling
#' \emph{EPIC}.
#'
#' @param bulk A matrix (\code{nGenes} x \code{nSamples}) of the genes
#'    expression from each bulk sample (the counts should be given in TPM,
#'    RPKM or FPKM when using the prebuilt reference profiles). This matrix
#'    needs to have rownames telling the gene names (corresponds to the gene
#'    symbol in the prebuilt reference profiles (e.g. CD8A, MS4A1) - no
#'    conversion of IDs is performed at the moment).
#'    It is advised to keep all genes in the bulk instead of a subset of
#'    signature genes (except if \code{scaleExprs = FALSE} in which case it
#'    doesn't make any difference).
#' @param reference (optional): A string or a list defining the reference cells.
#'    It can take multiple formats:
#'    - `NULL`: to use the default reference profiles and genes signature
#'    \code{TRef}
#'    - A character: either \code{"BRef"} or \code{"TRef"} to use the
#'    reference cells and genes signature of the corresponding datasets (see
#'    \code{BRef} and \code{TRef})
#'    - A list containing:
#'      - `$refProfiles`: a matrix (\code{nGenes} x \code{nCellTypes}) of the
#'      reference cells genes expression (don't include a column of the 'other
#'      cells' (representing usually the cancer cells for which such a profile
#'      is usually not conserved between samples); the rownames needs to be
#'      defined as well as the colnames giving the names of each gene and
#'      reference cell types respectively. It is advised to keep all genes in
#'      this \code{refProfiles} matrix instead of a subset of signature genes
#'      - `$sigGenes`: a character vector of the gene names to use as
#'      signature - sigGenes can also be given as a direct input to EPIC
#'      function
#'      - `$refProfiles.var` (optional): a matrix (\code{nGenes} x
#'      \code{nCellTypes}) of the variability of each gene expression for each
#'      cell type, which is used to define weights on each gene for the
#'      optimization (if this is absent, we assume an identical variability
#'      for all genes in all cells) - it needs to have the same dimnames than
#'      refProfiles
#' @param mRNA_cell (optional): A named numeric vector: tells (in arbitrary
#'    units) the amount of mRNA for each of the reference cells and of the
#'    other uncharacterized (cancer) cell. Two names are of special meaning:
#'    \emph{"otherCells"} - used for the mRNA/cell value of the "other cells"
#'    from the sample (i.e. the cell type that don't have any reference gene
#'    expression profile) ; and \emph{default} - used for the mRNA/cell of the
#'    cells from the reference profiles for which no specific value is given
#'    in mRNA_cell (i.e. if mRNA_cell=c(Bcells=2, NKcells=2.1, otherCells=3.5,
#'    default=1), then if the refProfiles described Bcells, NKcells and Tcells,
#'    we would use a value of 3.5 for the "otherCells" that didn't have any
#'    reference profile and a default value of 1 for the Tcells when computing
#'    the cell fractions).
#'    To note: if data is in tpm, this mRNA per cell would ideally correspond
#'    to some number of transcripts per cell.
#' @param mRNA_cell_sub (optional): This can be given instead of
#'   \code{mRNA_cell} (or in addition to it). It is also a named numeric vector,
#'   used to replace only the mRNA/cell values from some cell types (or to add
#'   values for new cell types). The values given in mRNA_cell_sub will overwrite
#'   the default values as well as those that might have been given by mRNA_cell.
#' @param sigGenes (optional): a character vector of the gene names to use as
#'    signature for the deconvolution. In principle this is given with the
#'    reference as the "reference$sigGenes" but if we give a value for this
#'    input variable, it is these signature genes that will be used instead of
#'    the ones given with the reference profile.
#' @param scaleExprs (optional, default is TRUE): boolean telling if the bulk
#'    samples and reference gene expression profiles should be rescaled based on
#'    the list of genes in common between the them (such a rescaling is
#'    recommanded).
#' @param withOtherCells (optional, default is TRUE): if EPIC should allow for
#'    an additional cell type for which no gene expression reference profile is
#'    available or if the bulk is assumed to be composed only of the cells with
#'    reference profiles.
#' @param constrainedSum (optional, default is TRUE): tells if the sum of all
#'    cell types should be constrained to be < 1. When
#'    \code{withOtherCells=FALSE}, there is additionally a constrain the the sum
#'    of all cell types with reference profiles must be > 0.99.
#' @param rangeBasedOptim (optional): when this is FALSE (the default), the
#'    least square optimization is performed as described in
#'    \href{https://elifesciences.org/articles/26476}{
#'     \cite{Racle et al., 2017, eLife}}, which is recommanded.
#'    When this variable is TRUE, EPIC uses the variability of each gene
#'    from the reference profiles in another way: instead of defining weights
#'    (based on the variability) for the fit of each gene, we define a range of
#'    values accessible for each gene (based on the gene expression value in
#'    the reference profile +/- the variability values). The
#'    error that the optimization tries to minimize is by how much
#'    the predicted gene expression is outside of this allowed range of values.
#'
#' @return A list of 3 matrices:
#' - `mRNAProportions`: (\code{nSamples} x (\code{nCellTypes+1})) the
#' proportion of mRNA coming from all cell types with a ref profile + the
#' uncharacterized other cell
#' - `cellFractions`: (\code{nSamples} x (\code{nCellTypes+1})) this gives the
#' proportion of cells from each cell type after accounting for the mRNA /
#' cell value
#' - `fit.gof`: (\code{nSamples} x 12) a matrix telling the quality for the
#' fit of the signature genes in each sample. It tells if the minimization
#' converged, and other info about this fit comparing the measured gene
#' expression in the sigGenes vs predicted gene expression in the sigGenes
#'
#' @examples
#' melanoma_counts <- matrix(abs(rnorm(80)), nrow = 20, ncol = 4)
#' rownames(melanoma_counts) <- paste0("Gene", 1:20)
#' colnames(melanoma_counts) <- paste0("Sample", 1:4)
#' mock_ref <- list(
#'   refProfiles = matrix(abs(rnorm(80)), nrow = 20, ncol = 4),
#'   sigGenes = paste0("Gene", 1:10)
#' )
#' rownames(mock_ref$refProfiles) <- paste0("Gene", 1:20)
#' colnames(mock_ref$refProfiles) <- c("Bcells", "CD4T", "CD8T", "NK")
#' res1 <- EPIC(melanoma_counts, reference = mock_ref)
#' if (!is.null(res1)) head(res1$cellFractions)
#'
#' @export
EPIC <- function(bulk, reference = NULL, mRNA_cell = NULL, mRNA_cell_sub = NULL,
                 sigGenes = NULL, scaleExprs = TRUE, withOtherCells = TRUE,
                 constrainedSum = TRUE, rangeBasedOptim = FALSE) {

  if (is.null(bulk)) return(NULL)

  # Checking the correct format of the bulk sample input
  if (!is.matrix(bulk) && !is.data.frame(bulk)) {
    stop("'bulk' needs to be given as a matrix or data.frame")
  }

  # First get the value of the reference profiles depending on the input
  # 'reference'.
  with_w <- TRUE
  if (is.null(reference)) {
    reference <- load_data("TRef")
    if (is.null(reference)) return(NULL)
  } else if (is.character(reference)) {
    if (reference %in% c("TRef", "BRef")) {
      reference <- load_data(reference)
      if (is.null(reference)) return(NULL)
      # Replace the char defining the reference name by the corresponding
      # pre-built reference values.
    } else {
      stop(
        "The reference, '", reference, "' is not part of the allowed ",
        "references:", paste(c("TRef", "BRef"), collapse = ", ")
      )
    }
  } else if (is.list(reference)) {
    refListNames <- names(reference)
    if ((!all(c("refProfiles", "sigGenes") %in% refListNames)) ||
      (("refProfiles" %in% refListNames) && !is.null(sigGenes))) {
      stop(
        "Reference, when given as a list needs to contain at least the ",
        "fields 'refProfiles' and 'sigGenes' (sigGenes could also be ",
        "given as input to EPIC instead)"
      )
    }
    if (!is.matrix(reference$refProfiles) &&
      !is.data.frame(reference$refProfiles)) {
      stop(
        "'reference$refProfiles' needs to be given as a matrix or data.frame"
      )
    }
    if (!("refProfiles.var" %in% refListNames)) {
      warning(
        "'refProfiles.var' not defined; using identical weights ",
        "for all genes"
      )
      with_w <- FALSE
    } else if (!is.matrix(reference$refProfiles.var) &&
      !is.data.frame(reference$refProfiles.var)) {
      stop(
        "'reference$refProfiles.var' needs to be given as a matrix or ",
        "data.frame when present."
      )
    } else if (!identical(
      dim(reference$refProfiles.var),
      dim(reference$refProfiles)
    ) ||
      !identical(
        dimnames(reference$refProfiles.var),
        dimnames(reference$refProfiles)
      )) {
      stop(
        "The dimensions and dimnames of 'reference$refProfiles' and ",
        "'reference$refProfiles.var' need to be the same"
      )
    }
  } else {
    stop("Unknown format for 'reference'")
  }

  bulk <- merge_duplicates(bulk, in_type = "bulk samples")
  refProfiles <- merge_duplicates(reference$refProfiles,
    in_type = "reference profiles"
  )
  if (with_w) {
    refProfiles.var <- merge_duplicates(reference$refProfiles.var, warn = FALSE)
    # Don't warn here as we're already warning for refProfile and they had same
    # dim names.
  } else {
    refProfiles.var <- 0
  }

  nSamples <- NCOL(bulk)
  samplesNames <- colnames(bulk)
  if (is.null(samplesNames)) {
    samplesNames <- 1:nSamples
    colnames(bulk) <- samplesNames
  }
  nRefCells <- NCOL(refProfiles)
  refCellsNames <- colnames(refProfiles)


  # Keeping only common genes and normalizing the counts based on these common
  # genes
  bulk_NA <- apply(is.na(bulk), MARGIN = 1, FUN = all)
  if (any(bulk_NA)) {
    warning(sum(bulk_NA), " genes are NA in all bulk samples, removing these.")
    bulk <- bulk[!bulk_NA, ]
  }
  bulkGenes <- rownames(bulk)
  refGenes <- rownames(refProfiles)
  commonGenes <- intersect(bulkGenes, refGenes)

  if (is.null(sigGenes)) {
    sigGenes <- unique(reference$sigGenes)
  } # Keep only once in case of duplicates
  sigGenes <- sigGenes[sigGenes %in% commonGenes]
  nSigGenes <- length(sigGenes)
  if (nSigGenes < nRefCells) {
    stop(
      "There are only ", nSigGenes, " signature genes",
      " matching common genes between bulk and reference profiles,",
      " but there should be more signature genes than reference cells"
    )
  }

  if (scaleExprs) {
    if (length(commonGenes) < 2e3) {
      warning(
        "there are few genes in common between the bulk samples and ",
        "reference cells:", length(commonGenes), ", so the data scaling ",
        "might be an issue"
      )
    }
    # The value of 2e3 is arbitrary, but should be a respectable number for the
    # data renormalization.
    bulk <- scaleCounts(bulk, sigGenes, commonGenes)$counts
    temp <- scaleCounts(refProfiles, sigGenes, commonGenes)
    refProfiles <- temp$counts
    if (with_w) {
      refProfiles.var <- scaleCounts(refProfiles.var, sigGenes,
        normFact = temp$normFact
      )$counts
    }
    # the refProfiles.var is normalized by the same factors as refProfiles.
  } else {
    bulk <- bulk[sigGenes, ]
    refProfiles <- refProfiles[sigGenes, ]
    if (with_w) {
      refProfiles.var <- refProfiles.var[sigGenes, ]
    }
  }

  if (is.null(mRNA_cell)) {
    mRNA_cell <- load_data("mRNA_cell_default")
    if (is.null(mRNA_cell)) return(NULL)
  }

  if (!is.null(mRNA_cell_sub)) {
    if (is.null(names(mRNA_cell_sub)) || !is.numeric(mRNA_cell_sub)) {
      stop("When mRNA_cell_sub is given, it needs to be a named numeric vector")
    }
    mRNA_cell[names(mRNA_cell_sub)] <- mRNA_cell_sub
  }

  minFun <- function(x, A, b, w) {
    # Basic minimization function used to minimize the squared sum of the error
    # between the fit and observed value (A*x - b). We also give a weight, w,
    # for each gene to give more or less importance to the fit of each (can use
    # a value 1 if don't want to give any weight).
    return(sum((w * (A %*% x - b)^2), na.rm = TRUE))
  }
  minFun.range <- function(x, A, b, A.var) {
    # Other minimization function where we don't use weights but instead give
    # also the variability on A as input and we'll compute for each gene the
    # min / max value of the pred based on this variability to keep the smallest
    # value. I.e. we compute a range of the values that y_pred can take for each
    # gene based on the ref profile variability and when the y_true is within
    # this range then we return an error of 0 for this gene and if y_true is
    # outside of the range, we keep the "smallest error" (i.e. value most
    # nearby of the possible range).
    val.max <- (A + A.var) %*% x - b
    val.min <- (A - A.var) %*% x - b
    cErr <- rep(0, length(b))
    outOfRange <- (sign(val.max) * sign(val.min) == 1)
    cErr[outOfRange] <- pmin(abs(val.max[outOfRange]), abs(val.min[outOfRange]))
    return(sum(cErr, na.rm = TRUE))
  }

  if (with_w && !rangeBasedOptim) {
    # Computing the weight to give to each gene
    w <- rowSums(refProfiles / (refProfiles.var + 1e-12), na.rm = TRUE)
    # 1e-12 to avoid divisions by 0: like this if refProfiles and
    # refProfiles.var both are 0 for a given element, it will result in a weight
    # of 0.
    med_w <- stats::median(w[w > 0], na.rm = TRUE)
    w[w > 100 * med_w] <- 100 * med_w
    # Set a limit for the big w to still not give too much weights to these
    # genes
  } else {
    w <- 1
  }

  # Defining the constraints for the fit of the proportions.
  if (withOtherCells) {
    cMin <- 0
  } else {
    cMin <- 0.99
    # So that when we assume no cells without known reference profile are
    # present, we ask the sum of the mRNA proportions of known cells to be at
    # least 0.99.
  }
  cMax <- 1
  ui <- diag(nRefCells)
  ci <- rep(0, nRefCells)
  if (constrainedSum) {
    ui <- rbind(ui, rep(1, nRefCells), rep(-1, nRefCells))
    ci <- c(ci, cMin, -cMax)
  }
  # ui and ci define the constraints, in the form "ui %*% x - ci >= 0".
  # The first constraints are that the proportion of each cell must be
  # positive; then if constrainedSum is true, we want the sum of all proportions
  # to be bigger than cMin and this sum must also be smaller or equal to cMax.
  cInitProp <- (min(1, cMax) - 1e-5) / nRefCells
  # used as an initial guess for the optimized - start with all cell fractions
  # equal. We added the -1e-5 in cInitProp because the optimizer needs to have
  # the initial guess inside the admissible region and not on its boundary

  # Estimating for each sample the proportion of the mRNA per cell type.
  tempPropPred <- lapply(1:nSamples, FUN = function(cSample) {
    b <- bulk[, cSample]
    if (!rangeBasedOptim) {
      fit <- stats::constrOptim(
        theta = rep(cInitProp, nRefCells), f = minFun,
        grad = NULL, ui = ui, ci = ci, A = refProfiles, b = b, w = w
      )
    } else {
      fit <- stats::constrOptim(
        theta = rep(cInitProp, nRefCells), f = minFun.range,
        grad = NULL, ui = ui, ci = ci, A = refProfiles,
        b = b, A.var = refProfiles.var
      )
    }
    fit$x <- fit$par
    if (!withOtherCells) {
      fit$x <- fit$x / sum(fit$x, na.rm = TRUE)
    }
    # So that the sum is really equal to 1 even if the best pred was giving
    # slightly lower values when we force the system to have only known cells.

    # Checking how well the estimated proportions predict the gene expression
    b_estimated <- refProfiles %*% fit$x

    if (nSigGenes > 2) {
      suppressWarnings(
        corSp.test <- stats::cor.test(b, b_estimated, method = "spearman")
      )
      # Use suppressWarnings to avoid the warnings that p-value isn't exact when
      # ties are present in the data.
      corPear.test <- stats::cor.test(b, b_estimated, method = "pearson")
    } else {
      # cannot compute correlations with less than 3 observations.
      corSp.test <- corPear.test <- list()
      corSp.test$estimate <- corSp.test$p.value <- corPear.test$estimate <-
        corPear.test$p.value <- NA
    }
    regLine <- stats::lm(b_estimated ~ b)
    regLine_through0 <- stats::lm(b_estimated ~ b + 0)
    if (!rangeBasedOptim) {
      rmse_pred <- sqrt(
        minFun(x = fit$x, A = refProfiles, b = b, w = w) / nSigGenes
      )
      rmse_0 <- sqrt(
        minFun(x = rep(0, nRefCells), A = refProfiles, b = b, w = w) / nSigGenes
      )
    } else {
      rmse_pred <- sqrt(minFun.range(
        x = fit$x, A = refProfiles, b = b,
        A.var = refProfiles.var
      ) / nSigGenes)
      rmse_0 <- sqrt(minFun.range(
        x = rep(0, nRefCells), A = refProfiles, b = b,
        A.var = refProfiles.var
      ) / nSigGenes)
    }
    gof <- data.frame(
      fit$convergence,
      ifelse(is.null(fit$message), "", fit$message),
      rmse_pred, rmse_0,
      # to only have sum((w*b)^2) or corresponding value based
      # on the minFun, in the worst case possible.
      corSp.test$estimate, corSp.test$p.value,
      corPear.test$estimate, corPear.test$p.value,
      regLine$coefficients[2], regLine$coefficients[1],
      regLine_through0$coefficients[1], sum(fit$x),
      stringsAsFactors = FALSE
    )

    return(list(mRNAProportions = fit$x, fit.gof = gof))
  })
  mRNAProportions <- do.call(rbind, lapply(
    tempPropPred,
    function(x) x$mRNAProportions
  ))
  dimnames(mRNAProportions) <- list(samplesNames, refCellsNames)

  fit.gof <- do.call(rbind, lapply(tempPropPred, function(x) x$fit.gof))
  dimnames(fit.gof) <- list(samplesNames, c(
    "convergeCode", "convergeMessage", "RMSE_weighted",
    "Root_mean_squared_geneExpr_weighted",
    "spearmanR", "spearmanP", "pearsonR", "pearsonP", "regline_a_x",
    "regline_b", "regline_a_x_through0", "sum_mRNAProportions"
  ))
  # Some matrix giving information on the goodness of the fit.

  if (any(fit.gof$convergeCode != 0)) {
    warning(
      "The optimization didn't fully converge for some samples:\n",
      paste(samplesNames[fit.gof$convergeCode != 0], collapse = "; "),
      "\n - check fit.gof for the convergeCode and convergeMessage"
    )
  }

  if (withOtherCells) {
    mRNAProportions <- cbind(
      mRNAProportions,
      otherCells = 1 - rowSums(mRNAProportions)
    )
  }
  # Adding a row to the proportion matrix corresponding to the other /
  # uncharacterized / cancer cells.

  tInds <- match(colnames(mRNAProportions), names(mRNA_cell))
  if (anyNA(tInds)) {
    defaultInd <- match("default", names(mRNA_cell))
    if (is.na(defaultInd)) {
      tStr <- paste(
        " and no default value is given for this mRNA per cell,",
        "so we cannot estimate the cellFractions, only",
        "the mRNA proportions"
      )
    } else {
      tStr <- paste(
        " - using the default value of", mRNA_cell[defaultInd],
        "for these but this might bias the true cell proportions from",
        "all cell types."
      )
    }
    warning(
      "mRNA_cell value unknown for some cell types: ",
      paste(colnames(mRNAProportions)[is.na(tInds)], collapse = ", "),
      tStr
    )
    tInds[is.na(tInds)] <- defaultInd
  }
  cellFractions <- t(t(mRNAProportions) / mRNA_cell[tInds])
  cellFractions <- cellFractions / rowSums(cellFractions, na.rm = FALSE)
  # So that the cell fractions sum up to 1. In case some values were NA (due
  # to unknown mRNA_cell value for some cell types without default value), then
  # the full cellFractions will be NA - if we used na.rm=T, then we'd have the
  # sum of the other than "NA" cells equal to 1 as if this "NA" cell was not
  # present in the sample.

  return(list(
    mRNAProportions = mRNAProportions, cellFractions = cellFractions,
    fit.gof = fit.gof
  ))
}

#' Scaling raw counts from each sample.
#'
#' Normalizing the sum of counts from each sample to 1e6.
#'
#' Function taking a matrix (\emph{genes} x \emph{samples}) of counts as input
#' and returning the scaled counts for the subset signature genes (or all genes
#' if it sigGenes is \code{NULL}), with the scaled counts computed based on all
#' the 'renormGenes' (or all genes if it is NULL). The renormalization is made
#' independently for each sample, to have the sum of each columns over the
#' renormGenes equal to 1e6.
#'
#' normFact, if not null, is used as the normalization factor instead of
#'  the colSums (used to renormalize the refProfiles.var by the same amount
#'  than the refProfiles).
#'
#' @keywords internal
scaleCounts <- function(counts, sigGenes = NULL, renormGenes = NULL,
                        normFact = NULL) {
  if (is.null(sigGenes)) {
    sigGenes <- 1:nrow(counts)
  }

  if (is.null(normFact)) {
    if (is.null(renormGenes)) {
      renormGenes <- 1:nrow(counts)
    }
    normFact <- colSums(counts[renormGenes, , drop = FALSE], na.rm = TRUE)
  }
  counts <- t(t(counts[sigGenes, , drop = FALSE]) / normFact) * 1e6
  # Need to take the transpose so that the division is made on the correct
  # elements
  return(list(counts = counts, normFact = normFact))
}

#' Merging the duplicates from the input matrix.
#'
#' In case there are some duplicate rownames in the input matrix (\emph{mat}),
#' this function will return a similar matrix with unique rownames and the
#' duplicate cases will be merged together based on the median values. When
#' \emph{warn} is true a warning message will be written in case there are
#' duplicates, this warning message will include the \emph{in_type} string to
#' indicate the current type of the input matrix.
#'
#' @param mat matrix
#' @param warn warning
#' @param in_type default is null
#'
#' @keywords internal
merge_duplicates <- function(mat, warn = TRUE, in_type = NULL) {
  dupl <- duplicated(rownames(mat))
  if (sum(dupl) > 0) {
    dupl_genes <- unique(rownames(mat)[dupl])
    if (warn) {
      warning(
        "There are ", length(dupl_genes), " duplicated gene names",
        ifelse(!is.null(in_type), paste(" in the", in_type), ""),
        ". We'll use the median value for each of these cases."
      )
    }
    mat_dupl <- mat[rownames(mat) %in% dupl_genes, , drop = FALSE]
    mat_dupl_names <- rownames(mat_dupl)
    mat <- mat[!dupl, , drop = FALSE]
    # First put the dupl cases in a separate matrix and keep only the unique
    # gene names in the mat matrix.
    mat[dupl_genes, ] <- t(sapply(dupl_genes, FUN = function(cgene) {
      apply(mat_dupl[mat_dupl_names == cgene, , drop = FALSE],
        MARGIN = 2, FUN = median
      )
    }))
  }
  return(mat)
}