# ####################################'
#
# Package EPIC - developed by Julien Racle from David Gfeller's group of the
# University of Lausanne and Ludwig Institute for Cancer Research. Copyrights
# remain reserved as described in the included LICENSE file.
#
# ####################################'
#' Estimate the proportion of immune and cancer cells.
#'
#' \code{EPIC} takes as input bulk gene expression data (RNA-seq) and returns
#' the proportion of mRNA and cells composing the various samples.
#'
#' This function uses a constrained least square minimization to estimate the
#' proportion of each cell type with a reference profile and another
#' uncharacterized cell type in bulk gene expression samples.
#'
#' The names of the genes in the bulk samples, the reference samples and in the
#' gene signature list need to be the same format (gene symbols are used in the
#' predefined reference profiles). The full list of gene names don't need to be
#' exactly the same between the reference and bulk samples: \emph{EPIC}
#' will use the intersection of the genes. In case of duplicate gene names,
#' \emph{EPIC} will use the median value per duplicate - if you want to consider
#' these cases differently, you can remove the duplicates before calling
#' \emph{EPIC}.
#'
#' @param bulk A matrix (\code{nGenes} x \code{nSamples}) of the genes
#' expression from each bulk sample (the counts should be given in TPM,
#' RPKM or FPKM when using the prebuilt reference profiles). This matrix
#' needs to have rownames telling the gene names (corresponds to the gene
#' symbol in the prebuilt reference profiles (e.g. CD8A, MS4A1) - no
#' conversion of IDs is performed at the moment).
#' It is advised to keep all genes in the bulk instead of a subset of
#' signature genes (except if \code{scaleExprs = FALSE} in which case it
#' doesn't make any difference).
#' @param reference (optional): A string or a list defining the reference cells.
#' It can take multiple formats, either: \itemize{
#' \item\code{NULL}: to use the default reference profiles and genes
#' signature \code{\link{TRef}}.
#' \item a char: either \emph{"BRef"} or \emph{"TRef"}
#' to use the reference cells and genes signature of the corresponding
#' datasets (see \code{\link{BRef}} and \code{\link{TRef}}).
#' \item a list. When a list it should include: \describe{
#' \item{\code{$refProfiles}}{a matrix (\code{nGenes} x \code{nCellTypes})
#' of the reference cells genes expression (don't include a column of
#' the 'other cells' (representing usually the cancer cells for which
#' such a profile is usually not conserved between samples);
#' the rownames needs to be defined as well as the colnames giving the
#' names of each gene and reference cell types respectively.
#' It is advised to keep all genes in this \code{refProfiles} matrix
#' instead of a subset of signature genes;
#' }
#' \item{\code{$sigGenes}}{a character vector of the gene names to use as
#' signature - sigGenes can also be given as a direct input to EPIC
#' function;}
#' \item{\code{$refProfiles.var}}{(optional): a matrix (\code{nGenes} x
#' \code{nCellTypes}) of the variability of each gene expression for each
#' cell type, which is used to define weights on each gene for the
#' optimization (if this is absent, we assume an identical variability
#' for all genes in all cells) - it needs to have the same dimnames than
#' refProfiles;}
#' }
#' }
#' @param mRNA_cell (optional): A named numeric vector: tells (in arbitrary
#' units) the amount of mRNA for each of the reference cells and of the
#' other uncharacterized (cancer) cell. Two names are of special meaning:
#' \emph{"otherCells"} - used for the mRNA/cell value of the "other cells"
#' from the sample (i.e. the cell type that don't have any reference gene
#' expression profile) ; and \emph{default} - used for the mRNA/cell of the
#' cells from the reference profiles for which no specific value is given
#' in mRNA_cell (i.e. if mRNA_cell=c(Bcells=2, NKcells=2.1, otherCells=3.5,
#' default=1), then if the refProfiles described Bcells, NKcells and Tcells,
#' we would use a value of 3.5 for the "otherCells" that didn't have any
#' reference profile and a default value of 1 for the Tcells when computing
#' the cell fractions).
#' To note: if data is in tpm, this mRNA per cell would ideally correspond
#' to some number of transcripts per cell.
#' @param mRNA_cell_sub (optional): This can be given instead of \code{mRNA_cell} (or
#' in addition to it). It is also a named numeric vector, used to replace
#' only the mRNA/cell values from some cell types (or to add values for new
#' cell types). The values given in mRNA_cell_sub will overwrite the default
#' values as well as those that might have been given by mRNA_cell.
#' @param sigGenes (optional): a character vector of the gene names to use as
#' signature for the deconvolution. In principle this is given with the
#' reference as the "reference$sigGenes" but if we give a value for this
#' input variable, it is these signature genes that will be used instead of
#' the ones given with the reference profile.
#' @param scaleExprs (optional, default is TRUE): boolean telling if the bulk
#' samples and reference gene expression profiles should be rescaled based on
#' the list of genes in common between the them (such a rescaling is
#' recommanded).
#' @param withOtherCells (optional, default is TRUE): if EPIC should allow for
#' an additional cell type for which no gene expression reference profile is
#' available or if the bulk is assumed to be composed only of the cells with
#' reference profiles.
#' @param constrainedSum (optional, default is TRUE): tells if the sum of all
#' cell types should be constrained to be < 1. When
#' \code{withOtherCells=FALSE}, there is additionally a constrain the the sum
#' of all cell types with reference profiles must be > 0.99.
#' @param rangeBasedOptim (optional): when this is FALSE (the default), the
#' least square optimization is performed as described in
#' \href{https://elifesciences.org/articles/26476}{
#' \cite{Racle et al., 2017, eLife}}, which is recommanded.
#' When this variable is TRUE, EPIC uses the variability of each gene
#' from the reference profiles in another way: instead of defining weights
#' (based on the variability) for the fit of each gene, we define a range of
#' values accessible for each gene (based on the gene expression value in
#' the reference profile +/- the variability values). The
#' error that the optimization tries to minimize is by how much
#' the predicted gene expression is outside of this allowed range of values.
#'
#' @return A list of 3 matrices:\describe{
#' \item{\code{mRNAProportions}}{(\code{nSamples} x (\code{nCellTypes+1})) the
#' proportion of mRNA coming from all cell types with a ref profile + the
#' uncharacterized other cell. Please note that if working with reconstructed
#' in silico bulk samples built for example from single-cell RNA-seq data,
#' then you should compare the 'true' proportions against these
#' 'mRNAProportions', while if working with true bulk samples, then you should
#' compare the cell proportions against the 'cellFractions'.}
#' \item{\code{cellFractions}}{(\code{nSamples} x (\code{nCellTypes+1})) this
#' gives the proportion of cells from each cell type after accounting for
#' the mRNA / cell value.}
#' \item{\code{fit.gof}}{(\code{nSamples} x 12) a matrix telling the quality
#' for the fit of the signature genes in each sample. It tells if the
#' minimization converged, and other info about this fit comparing the
#' measured gene expression in the sigGenes vs predicted gene expression in
#' the sigGenes.}
#' }
#'
#' @examples
#' res1 <- EPIC(melanoma_data$counts)
#' res1$cellFractions
#' res2 <- EPIC(melanoma_data$counts, TRef)
#' res3 <- EPIC(bulk=melanoma_data$counts, reference=TRef)
#' res4 <- EPIC(melanoma_data$counts, reference="TRef")
#' res5 <- EPIC(melanoma_data$counts, mRNA_cell_sub=c(Bcells=1, otherCells=5))
#' # Various possible ways of calling EPIC function. res 1 to 4 should
#' # give exactly the same outputs, and the elements res1$cellFractions
#' # should be equal to the example predictions found in
#' # melanoma_data$cellFractions.pred for these first 4 results.
#' # The values of cellFraction for res5 will be different due to the use of
#' # other mRNA per cell values for the B and other cells.
#'
#' @export
EPIC <- function(bulk, reference=NULL, mRNA_cell=NULL, mRNA_cell_sub=NULL,
sigGenes=NULL, scaleExprs=TRUE, withOtherCells=TRUE,
constrainedSum=TRUE, rangeBasedOptim=FALSE){
# Checking the correct format of the bulk sample input
if (!is.matrix(bulk) && !is.data.frame(bulk))
stop("'bulk' needs to be given as a matrix or data.frame")
# First get the value of the reference profiles depending on the input
# 'reference'.
with_w <- TRUE
if (is.null(reference)){
reference <- EPIC::TRef
} else if (is.character(reference)){
if (reference %in% prebuiltRefNames){
reference <- get(reference, pos="package:EPIC")
# Replace the char defining the reference name by the corresponding
# pre-built reference values.
} else
stop("The reference, '", reference, "' is not part of the allowed ",
"references:", paste(prebuiltRefNames, collapse=", "))
} else if (is.list(reference)){
refListNames <- names(reference)
if ( (!all(c("refProfiles", "sigGenes") %in% refListNames)) ||
(("refProfiles" %in% refListNames) && !is.null(sigGenes)) )
stop("Reference, when given as a list needs to contain at least the ",
"fields 'refProfiles' and 'sigGenes' (sigGenes could also be ",
"given as input to EPIC instead)")
if (!is.matrix(reference$refProfiles) && !is.data.frame(reference$refProfiles))
stop("'reference$refProfiles' needs to be given as a matrix or data.frame")
if (!("refProfiles.var" %in% refListNames)){
warning("'refProfiles.var' not defined; using identical weights ",
"for all genes")
with_w <- FALSE
} else if (!is.matrix(reference$refProfiles.var) &&
!is.data.frame(reference$refProfiles.var)){
stop("'reference$refProfiles.var' needs to be given as a matrix or ",
"data.frame when present.")
} else if (!identical(dim(reference$refProfiles.var), dim(reference$refProfiles))
|| !identical(dimnames(reference$refProfiles.var),
dimnames(reference$refProfiles)))
stop("The dimensions and dimnames of 'reference$refProfiles' and ",
"'reference$refProfiles.var' need to be the same")
} else {
stop("Unknown format for 'reference'")
}
bulk <- merge_duplicates(bulk, in_type="bulk samples")
refProfiles <- merge_duplicates(reference$refProfiles,
in_type="reference profiles")
if (with_w){
refProfiles.var <- merge_duplicates(reference$refProfiles.var, warn=F)
# Don't warn here as we're already warning for refProfile and they had same
# dim names.
} else {
refProfiles.var <- 0
}
nSamples <- NCOL(bulk); samplesNames <- colnames(bulk)
if (is.null(samplesNames)){
samplesNames <- 1:nSamples
colnames(bulk) <- samplesNames
}
nRefCells <- NCOL(refProfiles); refCellsNames <- colnames(refProfiles)
# Keeping only common genes and normalizing the counts based on these common
# genes
bulk_NA <- apply(is.na(bulk), MARGIN=1, FUN=all)
if (any(bulk_NA)){
warning(sum(bulk_NA), " genes are NA in all bulk samples, removing these.")
bulk <- bulk[!bulk_NA,]
}
bulkGenes <- rownames(bulk)
refGenes <- rownames(refProfiles)
commonGenes <- intersect(bulkGenes, refGenes)
if (is.null(sigGenes))
sigGenes <- unique(reference$sigGenes) # Keep only once in case of duplicates
sigGenes <- sigGenes[sigGenes %in% commonGenes]
nSigGenes <- length(sigGenes)
if (nSigGenes < nRefCells)
stop("There are only ", nSigGenes, " signature genes",
" matching common genes between bulk and reference profiles,",
" but there should be more signature genes than reference cells")
if (scaleExprs){
if (length(commonGenes) < 2e3)
warning("there are few genes in common between the bulk samples and ",
"reference cells:", length(commonGenes), ", so the data scaling ",
"might be an issue")
# The value of 2e3 is arbitrary, but should be a respectable number for the
# data renormalization.
bulk <- scaleCounts(bulk, sigGenes, commonGenes)$counts
temp <- scaleCounts(refProfiles, sigGenes, commonGenes)
refProfiles <- temp$counts
if (with_w)
refProfiles.var <- scaleCounts(refProfiles.var, sigGenes,
normFact=temp$normFact)$counts
# the refProfiles.var is normalized by the same factors as refProfiles.
} else {
bulk <- bulk[sigGenes,]
refProfiles <- refProfiles[sigGenes,]
if (with_w)
refProfiles.var <- refProfiles.var[sigGenes,]
}
if (is.null(mRNA_cell))
mRNA_cell <- EPIC::mRNA_cell_default
if (!is.null(mRNA_cell_sub)){
if (is.null(names(mRNA_cell_sub)) || !is.numeric(mRNA_cell_sub))
stop("When mRNA_cell_sub is given, it needs to be a named numeric vector")
mRNA_cell[names(mRNA_cell_sub)] <- mRNA_cell_sub
}
minFun <- function(x, A, b, w){
# Basic minimization function used to minimize the squared sum of the error
# between the fit and observed value (A*x - b). We also give a weight, w,
# for each gene to give more or less importance to the fit of each (can use
# a value 1 if don't want to give any weight).
return(sum( (w * (A %*% x - b)^2 ), na.rm = TRUE))
}
minFun.range <- function(x, A, b, A.var){
# Other minimization function where we don't use weights but instead give
# also the variability on A as input and we'll compute for each gene the
# min / max value of the pred based on this variability to keep the smallest
# value. I.e. we compute a range of the values that y_pred can take for each
# gene based on the ref profile variability and when the y_true is within
# this range then we return an error of 0 for this gene and if y_true is
# outside of the range, we keep the "smallest error" (i.e. value most
# nearby of the possible range).
val.max <- (A + A.var) %*%x - b
val.min <- (A - A.var) %*%x - b
cErr <- rep(0, length(b))
outOfRange <- (sign(val.max)*sign(val.min) == 1)
cErr[outOfRange] <- pmin(abs(val.max[outOfRange]), abs(val.min[outOfRange]))
return(sum(cErr, na.rm = TRUE))
}
if (with_w && !rangeBasedOptim){
# Computing the weight to give to each gene
w <- rowSums(refProfiles / (refProfiles.var + 1e-12), na.rm=TRUE)
# 1e-12 to avoid divisions by 0: like this if refProfiles and refProfiles.var
# both are 0 for a given element, it will result in a weight of 0.
med_w <- stats::median(w[w>0], na.rm=TRUE)
w[w> 100*med_w] <- 100*med_w
# Set a limit for the big w to still not give too much weights to these genes
} else
w <- 1
# Defining the constraints for the fit of the proportions.
if (withOtherCells){
cMin <- 0
} else {
cMin <- 0.99
# So that when we assume no cells without known reference profile are
# present, we ask the sum of the mRNA proportions of known cells to be at
# least 0.99.
}
cMax <- 1
ui <- diag(nRefCells)
ci <- rep(0,nRefCells)
if (constrainedSum){
ui <- rbind(ui, rep(1, nRefCells), rep(-1, nRefCells))
ci <- c(ci, cMin, -cMax)
}
# ui and ci define the constraints, in the form "ui %*% x - ci >= 0".
# The first constraints are that the proportion of each cell must be
# positive; then if constrainedSum is true, we want the sum of all proportions
# to be bigger than cMin and this sum must also be smaller or equal to cMax.
cInitProp <- (min(1, cMax)-1e-5) / nRefCells
# used as an initial guess for the optimized - start with all cell fractions
# equal. We added the -1e-5 in cInitProp because the optimizer needs to have
# the initial guess inside the admissible region and not on its boundary
# Estimating for each sample the proportion of the mRNA per cell type.
tempPropPred <- lapply(1:nSamples, FUN=function(cSample){
b <- bulk[,cSample]
if (!rangeBasedOptim){
fit <- stats::constrOptim(theta = rep(cInitProp, nRefCells), f=minFun,
grad=NULL, ui=ui, ci=ci, A=refProfiles, b=b, w=w)
} else {
fit <- stats::constrOptim(theta = rep(cInitProp, nRefCells), f=minFun.range,
grad=NULL, ui=ui, ci=ci, A=refProfiles, b=b, A.var=refProfiles.var)
}
fit$x <- fit$par
if (!withOtherCells)
fit$x <- fit$x / sum(fit$x, na.rm=TRUE)
# So that the sum is really equal to 1 even if the best pred was giving
# slightly lower values when we force the system to have only known cells.
# Checking how well the estimated proportions predict the gene expression
b_estimated <- refProfiles %*% fit$x
if (nSigGenes > 2){
suppressWarnings(corSp.test <- stats::cor.test(b, b_estimated, method="spearman"))
# Use suppressWarnings to avoid the warnings that p-value isn't exact when
# ties are present in the data.
corPear.test <- stats::cor.test(b, b_estimated, method="pearson")
} else {
# cannot compute correlations with less than 3 observations.
corSp.test <- corPear.test <- list()
corSp.test$estimate <- corSp.test$p.value <- corPear.test$estimate <-
corPear.test$p.value <- NA
}
regLine <- stats::lm(b_estimated ~ b)
regLine_through0 <- stats::lm(b_estimated ~ b+0)
if (!rangeBasedOptim){
rmse_pred <- sqrt(minFun(x=fit$x, A=refProfiles, b=b, w=w)/nSigGenes)
rmse_0 <- sqrt(minFun(x=rep(0,nRefCells), A=refProfiles, b=b, w=w)/nSigGenes)
} else {
rmse_pred <- sqrt(minFun.range(x=fit$x, A=refProfiles, b=b,
A.var=refProfiles.var)/nSigGenes)
rmse_0 <- sqrt(minFun.range(x=rep(0,nRefCells), A=refProfiles, b=b,
A.var=refProfiles.var)/nSigGenes)
}
gof <- data.frame(fit$convergence, ifelse(is.null(fit$message), "", fit$message),
rmse_pred, rmse_0,
# to only have sum((w*b)^2) or corresponding value based
# on the minFun, in the worst case possible.
corSp.test$estimate, corSp.test$p.value,
corPear.test$estimate, corPear.test$p.value,
regLine$coefficients[2], regLine$coefficients[1],
regLine_through0$coefficients[1], sum(fit$x),
stringsAsFactors=FALSE)
return(list(mRNAProportions=fit$x, fit.gof=gof))
} )
mRNAProportions <- do.call(rbind, lapply(tempPropPred,
function(x) x$mRNAProportions))
dimnames(mRNAProportions) <- list(samplesNames, refCellsNames)
fit.gof <- do.call(rbind, lapply(tempPropPred, function(x) x$fit.gof))
dimnames(fit.gof) <- list(samplesNames, c(
"convergeCode", "convergeMessage", "RMSE_weighted",
"Root_mean_squared_geneExpr_weighted",
"spearmanR", "spearmanP", "pearsonR", "pearsonP", "regline_a_x",
"regline_b", "regline_a_x_through0", "sum_mRNAProportions"))
# Some matrix giving information on the goodness of the fit.
if (any(fit.gof$convergeCode != 0))
warning("The optimization didn't fully converge for some samples:\n",
paste(samplesNames[fit.gof$convergeCode!=0], collapse="; "),
"\n - check fit.gof for the convergeCode and convergeMessage")
if (withOtherCells)
mRNAProportions <- cbind(mRNAProportions, otherCells=1-rowSums(mRNAProportions))
# Adding a row to the proportion matrix corresponding to the other /
# uncharacterized / cancer cells.
tInds <- match(colnames(mRNAProportions), names(mRNA_cell))
if (anyNA(tInds)){
defaultInd <- match("default", names(mRNA_cell))
if (is.na(defaultInd)){
warning("mRNA_cell value unknown for some cell types: ",
paste(colnames(mRNAProportions)[is.na(tInds)], collapse=", "),
" and no default value is given for the mRNA per cell, so we cannot ",
"estimate the cellFractions, only the mRNA proportions")
} else {
# warning("mRNA_cell value unknown for some cell types: ",
# paste(colnames(mRNAProportions)[is.na(tInds)], collapse=", "),
# " - using the default value of", mRNA_cell[defaultInd], " for these but ",
# "this might bias the true cell proportions from all cell types.")
# Not indicating this warning message as it comes about always if the
# user doesn't define additional mRNA_cell values by himself. Instead,
# I've indicated this warning in the documentation directly.
tInds[is.na(tInds)] <- defaultInd
}
}
cellFractions <- t( t(mRNAProportions) / mRNA_cell[tInds])
cellFractions <- cellFractions / rowSums(cellFractions, na.rm=FALSE)
# So that the cell fractions sum up to 1. In case some values were NA (due
# to unknown mRNA_cell value for some cell types without default value), then
# the full cellFractions will be NA - if we used na.rm=T, then we'd have the
# sum of the other than "NA" cells equal to 1 as if this "NA" cell was not
# present in the sample.
return(list(mRNAProportions=mRNAProportions, cellFractions=cellFractions,
fit.gof=fit.gof))
}
#' Scaling raw counts from each sample.
#'
#' Normalizing the sum of counts from each sample to 1e6.
#'
#' Function taking a matrix (\emph{genes} x \emph{samples}) of counts as input
#' and returning the scaled counts for the subset signature genes (or all genes
#' if it sigGenes is \code{NULL}), with the scaled counts computed based on all
#' the 'renormGenes' (or all genes if it is NULL). The renormalization is made
#' independently for each sample, to have the sum of each columns over the
#' renormGenes equal to 1e6.
#'
#' normFact, if not null, is used as the normalization factor instead of
#' the colSums (used to renormalize the refProfiles.var by the same amount
#' than the refProfiles).
#'
#' @keywords internal
scaleCounts <- function(counts, sigGenes=NULL, renormGenes=NULL, normFact=NULL){
if (is.null(sigGenes))
sigGenes <- 1:nrow(counts)
if (is.null(normFact)){
if (is.null(renormGenes))
renormGenes <- 1:nrow(counts)
normFact <- colSums(counts[renormGenes,,drop=FALSE], na.rm=TRUE)
}
counts <- t( t(counts[sigGenes,,drop=FALSE]) / normFact) * 1e6
# Need to take the transpose so that the division is made on the correct
# elements
return(list(counts=counts, normFact=normFact))
}
#' Merging the duplicates from the input matrix.
#'
#' In case there are some duplicate rownames in the input matrix (\emph{mat}),
#' this function will return a similar matrix with unique rownames and the
#' duplicate cases will be merged together based on the median values. When
#' \emph{warn} is true a warning message will be written in case there are
#' duplicates, this warning message will include the \emph{in_type} string to
#' indicate the current type of the input matrix.
#'
#' @keywords internal
merge_duplicates <- function(mat, warn=TRUE, in_type=NULL){
dupl <- duplicated(rownames(mat))
if (sum(dupl) > 0){
dupl_genes <- unique(rownames(mat)[dupl])
if (warn){
warning("There are ", length(dupl_genes), " duplicated gene names",
ifelse(!is.null(in_type), paste(" in the", in_type), ""),
" (e.g., ", paste0("'", dupl_genes[1:(min(5, length(dupl_genes)))],
"'", collapse=", "), "). We'll use the median value for ",
"each of these cases.")
}
mat_dupl <- mat[rownames(mat) %in% dupl_genes,,drop=F]
mat_dupl_names <- rownames(mat_dupl)
mat <- mat[!dupl,,drop=F]
# First put the dupl cases in a separate matrix and keep only the unique
# gene names in the mat matrix.
mat[match(dupl_genes, rownames(mat)),] <- t(sapply(dupl_genes, FUN=function(cgene)
apply(mat_dupl[mat_dupl_names == cgene,,drop=F], MARGIN=2, FUN=stats::median)))
}
return(mat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.