#' Evaluate correlation
#'
#' The loss-function learning digital tissue deconvolution finds a vector g
#' which minimizes the Loss function L\cr
#' \deqn{L(g) = - \sum cor(true_C, estimatd_C(g))}
#' The evaluate_cor function returns the value of the Loss function.
#'
#' @param X.matrix numeric matrix, with features/genes as rows,
#' and cell types as column. Each column of X.matrix is a reference
#' expression profile. A trained DTD model includes X.matrix, it has been
#' trained on. Therefore, X.matrix should only be set, if the 'DTD.model'
#' is not a DTD model.
#' @param new.data numeric matrix with samples as columns,
#' and features/genes as rows.
#' @param DTD.model either a numeric vector with length of nrow(X), or a list
#' returned by \code{\link{train_deconvolution_model}},
#' \code{\link{DTD_cv_lambda_cxx}}, or \code{\link{descent_generalized_fista}}.
#' In the equation above the DTD.model provides the vector g.
#' @param true.compositions numeric matrix with cells as rows,
#' and mixtures as columns. In the formula above named true_C.
#' Each row of C holds the distribution of the cell over all mixtures.
#' @param estimate.c.type string, either "non_negative", or "direct".
#' Indicates how the algorithm finds the solution of
#' \eqn{arg min_C ||diag(g)(Y - XC)||_2}.
#' \itemize{
#' \item If 'estimate.c.type' is set to "direct",
#' there is no regularization (see \code{\link{estimate_c}}),
#' \item if 'estimate.c.type' is set to "non_negative",
#' the estimates "C" must not be negative (non-negative least squares)
#' (see (see \code{\link{estimate_nn_c}}))
#' }
#' @return float, value of the Loss function
#'
#' @export
#' @examples
#' library(DTD)
#' random.data <- generate_random_data(
#' n.types = 10,
#' n.samples.per.type = 150,
#' n.features = 250,
#' sample.type = "Cell",
#' feature.type = "gene"
#' )
#'
#' # normalize all samples to the same amount of counts:
#' normalized.data <- normalize_to_count(random.data)
#'
#' # extract indicator list.
#' # This list contains the Type of the sample as value, and the sample name as name
#' indicator.list <- gsub("^Cell[0-9]*\\.", "", colnames(random.data))
#' names(indicator.list) <- colnames(random.data)
#'
#' # extract reference matrix X
#' # First, decide which cells should be deconvoluted.
#' # Notice, in the mixtures there can be more cells than in the reference matrix.
#' include.in.X <- paste0("Type", 2:7)
#'
#' percentage.of.all.cells <- 0.2
#' sample.X <- sample_random_X(
#' included.in.X = include.in.X,
#' pheno = indicator.list,
#' expr.data = normalized.data,
#' percentage.of.all.cells = percentage.of.all.cells
#' )
#' X.matrix <- sample.X$X.matrix
#' samples.to.remove <- sample.X$samples.to.remove
#' remaining.mat <- normalized.data[, -which(colnames(normalized.data) %in% samples.to.remove)]
#'
#' indicator.remain <- indicator.list[names(indicator.list) %in% colnames(remaining.mat)]
#' training.data <- mix_samples(
#' expr.data = remaining.mat,
#' pheno = indicator.remain,
#' included.in.X = include.in.X,
#' n.samples = 500,
#' n.per.mixture = 100,
#' verbose = FALSE
#' )
#'
#' start.tweak <- rep(1, nrow(X.matrix))
#' sum.cor <- evaluate_cor(
#' X.matrix = X.matrix
#' , new.data = training.data$mixtures
#' , true.compositions = training.data$quantities
#' , DTD.model = start.tweak
#' , estimate.c.type = "direct"
#' )
#'
#' rel.cor <- sum.cor/ncol(X.matrix)
#' cat("Relative correlation: ", -rel.cor, "\n")
evaluate_cor <- function(
X.matrix = NA,
new.data,
true.compositions,
DTD.model,
estimate.c.type
){
# the reference matrix can either be included in the DTD.model, or has to be passed
# via the X.matrix argument:
if(is.matrix(X.matrix) && !any(is.na(X.matrix))){
X <- X.matrix
}else{
if(is.list(DTD.model) && "reference.X" %in% names(DTD.model)){
if("reference.X" %in% names(DTD.model)){
X <- DTD.model$reference.X
}
if("best.model" %in% names(DTD.model)){
if("reference.X" %in% names(DTD.model$best.model)){
X <- DTD.model$best.model$reference.X
}
}
}
}
if(!exists("X")){
stop("In evaluate_cor: 'X.matrix' must be provided either as the 'X.matrix' argument, or within the DTD.model")
}
# as a DTD.model either a list, or only the tweak vector can be used:
if (is.list(DTD.model)) {
if ("best.model" %in% names(DTD.model)) {
if("Tweak" %in% names(DTD.model$best.model)){
gamma.vec <- DTD.model$best.model$Tweak
}
}else {
if ("Tweak" %in% names(DTD.model)) {
gamma.vec <- DTD.model$Tweak
}else {
stop("In evaluate_cor: DTD.model does not fit")
}
}
if ("estimate.c.type" %in% names(DTD.model)){
estimate.c.type <- DTD.model$estimate.c.type
}
}else {
gamma.vec <- DTD.model
}
if(length(gamma.vec) != nrow(X)){
stop("In evaluate_cor: 'DTD.model' does not fit 'X.matrix'. Check if the provided model includes as many 'Tweak' entries as there are features (=> rows) in 'X.matrix'")
}
if(!is.null(names(gamma.vec))){
if(all(rownames(X) %in% names(gamma.vec))){
gamma.vec <- gamma.vec[rownames(X)]
}else{
stop("In evaluate_cor: There are features within 'X.matrix' where no tweak/g entry in the 'DTD.model' can be found")
}
}
if(!is.numeric(gamma.vec)){
stop("In evaluate_cor: Used 'Tweak' (from 'DTD.model') is not numeric")
}
if(nrow(true.compositions) != ncol(X)){
stop("In evaluate_cor: 'nrow(true.composition)' does not match 'ncol(X.matrix)'")
}
if(ncol(true.compositions) != ncol(new.data)){
stop("In evaluate_cor: 'ncol(true.composition)' does not match 'ncol(new.data)' ")
}
if(all(colnames(true.compositions) %in% colnames(new.data))){
true.compositions <- true.compositions[, colnames(new.data), drop = FALSE]
}else{
stop("In evaluate_cor: 'colnames(true.compositions)' do not match 'colnames(new.data)'")
}
if(all(rownames(true.compositions) %in% colnames(X))){
true.compositions <- true.compositions[colnames(X), , drop = FALSE]
}else{
stop("In evaluate_cor: 'rownames(true.compositions)' do not match 'colnames(X.matrix)'")
}
if(all(rownames(new.data) %in% rownames(X))){
new.data <- new.data[rownames(X), , drop = FALSE]
}else{
stop("in evaluate_cor: not all'rownames(new.data)' are in 'rownames(X.matrix)'")
}
Y <- new.data
C <- true.compositions
# estimate C using reference matrix, bulks and the tweak vector:
esti.cs <- estimate_c(
X.matrix = X
, new.data = Y
, DTD.model = gamma.vec
, estimate.c.type = estimate.c.type
)
if(any(dim(esti.cs) != dim(C))){
stop("evaluate_cor: dimension of estimated C do not fit")
}
# initialise loss:
loss <- 0
for(l1 in 1:nrow(C)){
if(stats::sd(esti.cs[l1, ], na.rm = TRUE) != 0){
# calculate the correlation per Type and add them up
loss <- loss + stats::cor(C[l1, ], esti.cs[l1, ])
}
}
# Our loss function is set to be a minimization problem, therefore we invert the sign:
loss <- -loss
return(loss)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.