# Functions for clone ID
#' Infer clonal identity of single cells
#'
#' @param A variant x cell matrix of integers; number of alternative allele
#' reads in variant i cell j
#' @param D variant x cell matrix of integers; number of total reads covering
#' variant i cell j
#' @param Config variant x clone matrix of binary values. The clone-variant
#' configuration, which encodes the phylogenetic tree structure. This is the
#' output Z of Canopy
#' @param n_clone integer(1), the number of clones to reconstruct. Only used
#' if Config is NULL
#' @param relax_Config logical(1), if TRUE, relax the clone Configuration:
#' rather than being fixed, Config acts as a prior with a relax rate.
#' @param relax_rate_fixed numeric(1), if a value between 0 and 1 is given,
#' the relax rate is fixed at that value while updating the clone Config. If
#' NULL, the relax rate is learned automatically with relax_rate_prior.
#' @param n_chain integer(1), the number of chains to run; results are
#' averaged across chains in the output
#' @param n_proc integer(1), the number of processors to use. Parallel
#' computation can substantially reduce the running time of multiple chains
#' @param inference character(1), the method to use for inference, either
#' "sampling" to use Gibbs sampling (default) or "EM" to use
#' expectation-maximization (faster)
#' @param verbose logical(1), should the function output verbose information as
#' it runs?
#' @param ... arguments passed to \code{\link{clone_id_Gibbs}} or
#' \code{\link{clone_id_EM}} (as appropriate)
#' @param Psi numeric vector, the fraction of each clone; the output P of
#' Canopy
#' @param min_iter integer(1), the minimum number of iterations in the Gibbs
#' sampling. The actual number of iterations may be larger, until convergence
#' is reached.
#' @param max_iter integer(1), the maximum number of iterations in the Gibbs
#' sampling, applied even if the convergence diagnostic has not been passed
#'
#' @return
#' If inference method is "EM", a list containing \code{theta}, a vector of
#' two floats denoting the parameters of the two components of the base
#' model, i.e., the mean of the Bernoulli or binomial model given that the
#' variant is present or absent, \code{prob}, the matrix of posterior
#' probabilities of each cell belonging to each clone with fitted parameters,
#' and \code{logLik}, the log likelihood of the final parameters.
#'
#' If inference method is "sampling", a list containing: \code{theta0}, the
#' mean of sampled false positive parameter values; \code{theta1}, the mean
#' of sampled (1 - false negative rate) parameter values; \code{theta0_all},
#' all sampled false positive parameter values; \code{theta1_all}, all
#' sampled (1 - false negative rate) parameter values; \code{element}, the
#' indices of the variant-by-cell entries at which theta1 is estimated;
#' \code{logLik}, the log likelihood of the model for each sampled parameter
#' set; \code{prob_all}, all sampled cell-clone assignment probabilities;
#' \code{prob}, the matrix with the mean of sampled cell-clone assignment
#' posterior probabilities (the key output of the model); and
#' \code{prob_variant}, the posterior probability of each variant being
#' present in each cell.
#'
#' @details
#' The two Bernoulli components correspond to false positive and false negative
#' rates. The two binomial components correspond to the read distributions
#' with and without the mutation present.
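#'
#' As a sketch of the per-cell likelihood evaluated in the code below (the
#' notation, using lower-case entries of A and D, is ours), the log
#' likelihood of cell \eqn{j} under clone \eqn{k} is, up to the binomial
#' coefficient,
#' \deqn{\log P(a_{.j}, d_{.j} | k) = \sum_i a_{ij} \log \theta_{c_{ik}} +
#' (d_{ij} - a_{ij}) \log(1 - \theta_{c_{ik}}),}
#' where \eqn{c_{ik}} is the Config entry for variant \eqn{i} in clone
#' \eqn{k}, \eqn{\theta_0} is the false positive rate, and \eqn{\theta_1} is
#' one minus the false negative rate.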
#'
#' @author Yuanhua Huang and Davis McCarthy
#'
#' @import stats
#' @name clone_id
#' @rdname clone_id
#' @export
#'
#' @examples
#' data(example_donor)
#' assignments <- clone_id(A_clone, D_clone,
#' Config = tree$Z,
#' min_iter = 800, max_iter = 1200
#' )
#' prob_heatmap(assignments$prob)
#'
#' assignments_EM <- clone_id(A_clone, D_clone,
#' Config = tree$Z,
#' inference = "EM"
#' )
#' prob_heatmap(assignments_EM$prob)
clone_id <- function(A, D, Config = NULL, n_clone = NULL, Psi = NULL,
relax_Config = TRUE, relax_rate_fixed = NULL,
inference = "sampling", n_chain = 1, n_proc = 1,
verbose = TRUE, ...) {
inference <- match.arg(inference, c("sampling", "EM"))
## check input data
    if (!identical(rownames(A), rownames(D))) {
        stop("Rownames for A and D are not identical.")
    }
    if (!identical(colnames(A), colnames(D))) {
        stop("Colnames for A and D are not identical.")
    }
if (is.null(Config) && is.null(n_clone)) {
stop("Config and n_clone can't be NULL together.")
}
if (any(A %% 1 != 0, na.rm = TRUE) ||
any(D %% 1 != 0, na.rm = TRUE)) {
stop("A and/or D contain non-integer values.")
}
    ## De-novo mode if Config is NULL
    if (is.null(Config)) {
        message("Config is NULL: de-novo mode is in use.")
Config <- matrix(0, nrow = nrow(D), ncol = n_clone)
rownames(Config) <- rownames(D)
colnames(Config) <- paste0("Clone", seq_len(n_clone))
relax_Config <- TRUE
relax_rate_fixed <- 0.5
}
## Match exome-seq and scRNA-seq data
if (!any(rownames(D) %in% rownames(Config))) {
stop("No matches in variant names between Config and D arguments.")
}
## match variants
common_vars <- intersect(rownames(Config), rownames(D))
A <- A[common_vars, , drop = FALSE]
D <- D[common_vars, , drop = FALSE]
Config <- Config[common_vars, , drop = FALSE]
if (verbose) {
message(length(common_vars), " variants used for cell assignment.")
}
## change sparse matrix to dense matrix
A <- as.matrix(A)
D <- as.matrix(D)
## pass data to specific functions
if (inference == "sampling") {
# Try to run sampling in parallel.
if (n_proc > 1) {
            # doMC is not available on Windows and may be missing elsewhere;
            # if so, fall back to running the chains sequentially on one
            # processor (so that all n_chain chains are still run).
            if (!requireNamespace("doMC", quietly = TRUE)) {
                message("Package doMC is not available, so parallel ",
                    "sampling is not supported.")
                message("Falling back to sampling using one processor.")
                ids_list <- lapply(seq_len(n_chain), function(x)
                    clone_id_Gibbs(A, D, Config,
                        Psi = Psi,
                        relax_Config = relax_Config,
                        relax_rate_fixed = relax_rate_fixed,
                        verbose = verbose, ...
                    )
                )
} else {
doMC::registerDoMC(n_proc)
`%dopar%` <- foreach::`%dopar%`
ids_list <- foreach::foreach(ii = seq_len(n_chain)) %dopar% {
clone_id_Gibbs(A, D, Config,
Psi = Psi,
relax_Config = relax_Config,
relax_rate_fixed = relax_rate_fixed,
verbose = verbose, ...
)
}
}
} else {
ids_list <- lapply(seq_len(n_chain), function(x)
clone_id_Gibbs(A, D, Config,
Psi = Psi,
relax_Config = relax_Config,
relax_rate_fixed = relax_rate_fixed,
verbose = verbose, ...
)
)
}
ids_out <- ids_list[[1]]
ids_out$n_chain <- 1
if (n_chain > 1) {
for (ii in seq(2, n_chain)) {
ids_out$n_chain <- ids_out$n_chain + 1
idx <- colMatch(ids_out$prob, ids_list[[ii]]$prob, force = TRUE)
ids_out$prob <- ids_out$prob + ids_list[[ii]]$prob[, idx]
ids_out$relax_rate <- ids_out$relax_rate +
ids_list[[ii]]$relax_rate
ids_out$Config_prob <- (ids_out$Config_prob +
ids_list[[ii]]$Config_prob[, idx])
}
ids_out$prob <- ids_out$prob / n_chain
ids_out$relax_rate <- ids_out$relax_rate / n_chain
ids_out$Config_prob <- ids_out$Config_prob / n_chain
}
return(ids_out)
    } else {
return(clone_id_EM(A, D, Config, verbose = verbose, ...))
}
}
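
## Illustrative sketch (kept inside `if (FALSE)` so it never runs on source;
## all object names here are hypothetical, not package data): simulate a tiny
## dataset with 20 variants, 2 clones and 10 cells, then assign cells with the
## EM mode of clone_id().
if (FALSE) {
    set.seed(7)
    vars <- paste0("var", 1:20)
    cells <- paste0("cell", 1:10)
    Config_sim <- matrix(0, nrow = 20, ncol = 2,
        dimnames = list(vars, c("Clone1", "Clone2")))
    Config_sim[11:20, 2] <- 1 # clone 2 carries variants 11-20
    D_sim <- matrix(rpois(200, lambda = 5), nrow = 20,
        dimnames = list(vars, cells)) # total read depth
    clones_true <- sample(1:2, 10, replace = TRUE)
    geno <- Config_sim[, clones_true] # variant presence per cell
    A_sim <- matrix(
        rbinom(200, size = D_sim, prob = ifelse(geno == 1, 0.45, 0.01)),
        nrow = 20, dimnames = list(vars, cells)) # ALT read counts
    res <- clone_id(A_sim, D_sim, Config = Config_sim, inference = "EM")
    assign_cells_to_clones(res$prob)
}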
#' Assign cells to clones from cardelino results
#'
#' @param prob_mat numeric matrix (cells x clones) of clone posterior
#' probabilities as output by \code{\link{clone_id}}
#' @param threshold numeric(1), posterior probability threshold for cell-clone
#' assignment: if posterior probability is above threshold, assign cell to
#' clone, otherwise leave cell "unassigned"
#'
#' @return a \code{data.frame} with cell ID, assigned clone label and maximum
#' posterior probability across clones.
#'
#' @author Davis McCarthy
#' @export
#' @examples
#' data(example_donor)
#' assignments <- clone_id(A_clone, D_clone, Config = tree$Z, inference = "EM")
#' df <- assign_cells_to_clones(assignments$prob)
#' head(df)
#' table(df$clone)
assign_cells_to_clones <- function(prob_mat, threshold = 0.5) {
assigns <- data.frame(
cell = rownames(prob_mat),
clone = rep("unassigned", nrow(prob_mat)),
prob_max = rep(NA, nrow(prob_mat)),
stringsAsFactors = FALSE
)
for (i in seq_len(nrow(prob_mat))) {
assigns[i, "prob_max"] <- max(prob_mat[i, ])
if (max(prob_mat[i, ]) > threshold) {
assigns[i, "clone"] <- names(which.max(prob_mat[i, ]))
}
}
assigns
}
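
## Minimal sketch of the threshold rule above: `p_toy` is a hypothetical
## 2-cell x 3-clone posterior probability matrix. With threshold = 0.5,
## cellA (max 0.9) is assigned and cellB (max 0.4) stays "unassigned".
if (FALSE) {
    p_toy <- matrix(c(0.9, 0.4, 0.05, 0.35, 0.05, 0.25), nrow = 2,
        dimnames = list(c("cellA", "cellB"),
                        c("Clone1", "Clone2", "Clone3")))
    assign_cells_to_clones(p_toy, threshold = 0.5)
}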
#' Assign cells to clones using an EM algorithm
#'
#' @param logLik_threshold numeric(1), the threshold on the increase in log
#' likelihood used to detect convergence.
#'
#' @return a list containing \code{theta}, a vector of two floats denoting the
#' binomial rates given variant exists or not, \code{prob}, the matrix of
#' posterior probabilities of each cell belonging to each clone with fitted
#' parameters, and \code{logLik}, the log likelihood of the final parameters.
#'
#' @author Yuanhua Huang
#' @import stats
#' @rdname clone_id
#' @export
#'
clone_id_EM <- function(A, D, Config, Psi = NULL, min_iter = 10,
max_iter = 1000, logLik_threshold = 1e-5,
verbose = TRUE) {
if (is.null(Psi)) {
Psi <- rep(1 / ncol(Config), ncol(Config))
}
if (dim(A)[1] != dim(D)[1] || dim(A)[2] != dim(D)[2] ||
dim(A)[1] != dim(Config)[1] || dim(Config)[2] != length(Psi)) {
stop(
"A and D must have the same size;\n",
"A and Config must have the same number of variants;\n",
"Config and Psi must have the same number of clones"
)
}
## preprocessing
N <- dim(A)[1] # number of variants
M <- dim(A)[2] # number of cells
K <- dim(Config)[2] # number of clones
    A[which(D == 0)] <- NA # no coverage: treat as missing
    D[which(D == 0)] <- NA
    A[(D > 0) & is.na(A)] <- 0 # covered but no recorded ALT reads
C1 <- Config
C0 <- 1 - Config
A1 <- A # number of alteration reads
B1 <- D - A # number of reference reads
W_log <- sum(lchoose(D, A), na.rm = TRUE) # log binomial coefficients
A1[is.na(A1)] <- 0
B1[is.na(B1)] <- 0
S1 <- t(A1) %*% C0 # Alteration reads without variants (C=0): theta[1]
S2 <- t(B1) %*% C0 # Reference reads without variants (C=0): 1-theta[1]
S3 <- t(A1) %*% C1 # Alteration reads with variants (C=1): theta[2]
S4 <- t(B1) %*% C1 # Reference reads with variants (C=1): 1-theta[2]
## random initialization for EM
theta <- c(stats::runif(1, 0.001, 0.25), stats::runif(1, 0.3, 0.6))
logLik <- 0
logLik_new <- logLik + logLik_threshold * 2
## EM iterations
for (it in seq_len(max_iter)) {
# Check convergence
if ((it > min_iter) && ((logLik_new - logLik) < logLik_threshold)) {
break
}
logLik <- logLik_new
# E-step
logLik_mat <- (S1 * log(theta[1]) + S2 * log(1 - theta[1]) +
S3 * log(theta[2]) + S4 * log(1 - theta[2]))
logLik_mat <- t(t(logLik_mat) + log(Psi))
logLik_vec <- rep(NA, nrow(logLik_mat))
for (i in seq_len(nrow(logLik_mat))) {
logLik_vec[i] <- matrixStats::logSumExp(logLik_mat[i, ],
na.rm = TRUE
)
}
logLik_new <- sum(logLik_vec, na.rm = TRUE) + W_log
logLik_mat_amplify <- logLik_mat - matrixStats::rowMaxs(logLik_mat)
prob_mat <- exp(logLik_mat_amplify) / rowSums(exp(logLik_mat_amplify))
# M-step
        if (is.na(sum(prob_mat * (S1 + S2))) ||
            sum(prob_mat * (S1 + S2)) == 0) {
            theta[1] <- 0.02 # default false positive rate: no informative reads
        } else {
            theta[1] <- sum(prob_mat * S1) / sum(prob_mat * (S1 + S2))
        }
        if (is.na(sum(prob_mat * (S3 + S4))) ||
            sum(prob_mat * (S3 + S4)) == 0) {
            theta[2] <- 0.75 # default (1 - false negative) rate
        } else {
            theta[2] <- sum(prob_mat * S3) / sum(prob_mat * (S3 + S4))
        }
}
    if (verbose) {
        message("Total iterations: ", it)
    }
    ## return values
    row.names(prob_mat) <- colnames(A) # cells
    colnames(prob_mat) <- colnames(Config) # clones
    return_list <- list("theta" = theta, "prob" = prob_mat, "logLik" = logLik)
return_list
}
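
## Minimal sketch of the E-step normalisation used above: the cell-clone
## responsibilities are a numerically stable softmax of the per-clone log
## likelihood plus log prior. Toy numbers only.
if (FALSE) {
    ll <- c(-10, -12, -11) + log(rep(1 / 3, 3)) # log lik + log Psi per clone
    prob <- exp(ll - max(ll)) / sum(exp(ll - max(ll))) # sums to 1
}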
#' Assign cells to clones using a Gibbs sampling algorithm
#'
#' @param prior0 numeric(2), alpha and beta parameters for the Beta prior
#' distribution on the inferred false positive rate.
#' @param prior1 numeric(2), alpha and beta parameters for the Beta prior
#' distribution on the inferred (1 - false negative) rate.
#' @param relax_rate_prior numeric(2), the two shape parameters of the Beta
#' prior distribution of the relax rate for relaxing the clone Configuration.
#' This mode is used when relax_rate_fixed is NULL.
#' @param keep_base_clone logical(1), if TRUE, keep the base clone (the first
#' column) of Config fixed at its input values when relax mode is used.
#' @param buin_frac numeric(1), the fraction of the chain to discard as the
#' burn-in period
#' @param wise character(1), the granularity at which theta1 is estimated:
#' "global", "variant" or "element".
#' @param relabel logical(1), if TRUE, relabel the samples of both Config and
#' prob during the Gibbs sampling.
#'
#' @author Yuanhua Huang
#' @import matrixStats
#' @rdname clone_id
#' @export
#'
clone_id_Gibbs <- function(A, D, Config, Psi = NULL,
relax_Config = TRUE, relax_rate_fixed = NULL,
relax_rate_prior = c(1, 9), keep_base_clone = TRUE,
prior0 = c(0.2, 99.8), prior1 = c(0.45, 0.55),
min_iter = 5000, max_iter = 20000, buin_frac = 0.5,
wise = "variant", relabel = FALSE, verbose = TRUE) {
if (is.null(Psi)) {
Psi <- rep(1 / ncol(Config), ncol(Config))
}
if (dim(A)[1] != dim(D)[1] || dim(A)[2] != dim(D)[2] ||
dim(A)[1] != dim(Config)[1] || dim(Config)[2] != length(Psi)) {
stop(
"A and D must have the same size;\n ",
"A and Config must have the same number of variants;\n",
"Config and Psi must have the same number of clones"
)
}
    if (!(wise %in% c("element", "variant", "global"))) {
        stop(
            "Input wise mode: ", wise,
            "; only element, variant and global are supported."
        )
    }
## preprocessing
N <- dim(A)[1] # number of variants
M <- dim(A)[2] # number of cells
K <- dim(Config)[2] # number of clones
    A[which(D == 0)] <- NA # no coverage: treat as missing
    D[which(D == 0)] <- NA
    A[(D > 0) & is.na(A)] <- 0 # covered but no recorded ALT reads
C1 <- Config
C0 <- 1 - Config
A1 <- A # number of alteration reads
B1 <- D - A # number of reference reads
W_log <- sum(lchoose(D, A), na.rm = TRUE) # log binomial coefficients
A1[is.na(A1)] <- 0
B1[is.na(B1)] <- 0
# reads number list for each clone
S1_list <- list()
S2_list <- list()
S3_list <- list()
S4_list <- list()
for (k in seq_len(K)) {
S1_list[[k]] <- A1 * C0[, k]
S2_list[[k]] <- B1 * C0[, k]
S3_list[[k]] <- A1 * C1[, k]
S4_list[[k]] <- B1 * C1[, k]
}
## Prepare for sampling
if (wise == "global") {
idx_vec <- seq_len(1) # For: theta1_all[t, ] <- theta1[idx_vec]
idx_mat <- seq_len(N * M) # For: theta1[idx_mat] <- theta1_all[t, ]
} else if (wise == "variant") {
idx_vec <- seq_len(N)
idx_mat <- seq_len(N * M)
} else if (wise == "element") {
idx_vec <- which(A1 + B1 > 0)
idx_mat <- which(A1 + B1 > 0)
}
n_element <- length(idx_vec)
    if (is.null(dim(prior1)) && length(prior1) == 2) {
        # expand the two parameters into an n_element x 2 matrix
        prior1 <- t(matrix(rep(prior1, n_element), nrow = 2))
    }
    if (!is.matrix(prior1)) {
        stop("prior1 needs to be a matrix of size n_element x 2")
    }
prob_all <- matrix(0, nrow = max_iter, ncol = M * K)
logLik_mat <- matrix(0, nrow = M, ncol = K)
logLik_all <- matrix(0, nrow = max_iter, ncol = 1)
assign_all <- matrix(0, nrow = max_iter, ncol = M)
theta0_all <- matrix(0, nrow = max_iter, ncol = 1)
theta1_all <- matrix(0, nrow = max_iter, ncol = n_element)
Config_all <- matrix(0, nrow = max_iter, ncol = N * K)
relax_rate_all <- matrix(0, nrow = max_iter, ncol = 1)
if (!is.null(relax_Config) && relax_Config != FALSE) {
if (!is.null(relax_rate_fixed)) {
if (relax_rate_fixed > 1 || relax_rate_fixed < 0) {
stop("relax_rate_fixed needs to be NULL or in [0, 1].")
}
relax_rate <- relax_rate_fixed ## fixed relax_rate
relax_rate_all[, ] <- relax_rate
} else if (!is.null(relax_rate_prior)) {
relax_rate <- relax_rate_prior[1] / (relax_rate_prior[1] +
relax_rate_prior[2])
        } else {
            stop("Require a value for either relax_rate_fixed or relax_rate_prior.")
        }
Config_new <- Config
Config_prior <- Config
Config_prior[Config == 1] <- 1 - relax_rate
Config_prior[Config == 0] <- relax_rate
if (keep_base_clone) {
Config_prior[, 1] <- Config[, 1]
}
Config_prior_oddlog <- log(Config_prior) - log(1 - Config_prior)
Iden_mat <- matrix(0, nrow = M, ncol = K)
}
## Random initialization
theta0_all[1, 1] <- stats::rbeta(1, prior0[1], prior0[2])
theta1_all[1, ] <- stats::rbeta(rep(1, n_element), prior1[, 1], prior1[, 2])
theta1 <- matrix(NA, nrow = N, ncol = M)
## Gibbs sampling
for (it in 2:max_iter) {
# Update prob_mat
theta0 <- theta0_all[it - 1, 1]
theta1[idx_mat] <- theta1_all[it - 1, ]
for (k in seq_len(K)) {
logLik_mat[, k] <- (colSums(S1_list[[k]] * log(theta0),
na.rm = TRUE
) +
colSums(S2_list[[k]] * log(1 - theta0), na.rm = TRUE) +
colSums(S3_list[[k]] * log(theta1), na.rm = TRUE) +
colSums(S4_list[[k]] * log(1 - theta1), na.rm = TRUE))
logLik_mat[, k] <- logLik_mat[, k] + log(Psi[k])
}
logLik_mat_amplify <- logLik_mat - matrixStats::rowMaxs(logLik_mat)
prob_mat <- exp(logLik_mat_amplify) / rowSums(exp(logLik_mat_amplify))
prob_all[it, ] <- prob_mat
# logLik_vec <- rep(NA, nrow(logLik_mat))
# for (i in seq_len(nrow(logLik_mat))) {
# logLik_vec[i] <- matrixStats::logSumExp(logLik_mat[i,],
# na.rm = TRUE)
# }
# logLik_all[it] <- sum(logLik_vec, na.rm = TRUE) + W_log
# #Update logLikelihood (TODO: add W0 and W1)
# Sample assignment
for (j in seq_len(M)) {
assign_all[it, j] <- sample(seq_len(K), 1,
replace = TRUE,
prob = prob_mat[j, ]
)
}
## Update Config
if (!is.null(relax_Config) && relax_Config != FALSE) {
if (it > (0.1 * min_iter + 5) && is.null(relax_rate_fixed)) {
diff0 <- sum((Config == Config_new)[, 2:ncol(Config)])
diff1 <- sum((Config != Config_new)[, 2:ncol(Config)])
relax_rate <- stats::rbeta(
1, relax_rate_prior[1] + diff1,
relax_rate_prior[2] + diff0
)
relax_rate_all[it] <- relax_rate
Config_prior <- Config
Config_prior[Config == 1] <- 1 - relax_rate
Config_prior[Config == 0] <- relax_rate
if (keep_base_clone) {
Config_prior[, 1] <- Config[, 1]
}
Config_prior_oddlog <- log(Config_prior) - log(1 - Config_prior)
}
Iden_mat[, ] <- 0
for (j in seq_len(M)) {
Iden_mat[j, assign_all[it, j]] <- 1
}
# calculate log_probability matrix with genotype 0 and 1
P0_mat <- A1 * log(theta0) + B1 * log(1 - theta0) + W_log
P1_mat <- A1 * log(theta1) + B1 * log(1 - theta1) + W_log
oddR_log <- P1_mat %*% Iden_mat - P0_mat %*% Iden_mat
oddR_log <- oddR_log + Config_prior_oddlog
            oddR_log[which(oddR_log > 50)] <- 50 # clip to avoid overflow
            oddR_log[which(oddR_log < -50)] <- -50 # in exp() below
Config_prob_tmp <- exp(oddR_log) / (exp(oddR_log) + 1)
Config_new[, ] <- stats::rbinom(N * K, size = 1, Config_prob_tmp)
Config_all[it, ] <- Config_new
for (k in seq_len(K)) {
S1_list[[k]] <- A1 * (1 - Config_new[, k])
S2_list[[k]] <- B1 * (1 - Config_new[, k])
S3_list[[k]] <- A1 * Config_new[, k]
S4_list[[k]] <- B1 * Config_new[, k]
}
}
# Sample theta with assigned clones
S1_wgt <- S2_wgt <- 0 # weighted S1
S3_wgt <- S4_wgt <- matrix(0, nrow = N, ncol = M)
for (k in seq_len(K)) {
idx <- which(assign_all[it, ] == k)
S1_wgt <- S1_wgt + sum(S1_list[[k]][, idx], na.rm = TRUE)
S2_wgt <- S2_wgt + sum(S2_list[[k]][, idx], na.rm = TRUE)
S3_wgt[, idx] <- S3_wgt[, idx] + S3_list[[k]][, idx]
S4_wgt[, idx] <- S4_wgt[, idx] + S4_list[[k]][, idx]
}
if (wise == "global") {
S3_wgt[, ] <- sum(S3_wgt, na.rm = TRUE)
S4_wgt[, ] <- sum(S4_wgt, na.rm = TRUE)
} else if (wise == "variant") {
S3_wgt[, ] <- rowSums(S3_wgt, na.rm = TRUE)
S4_wgt[, ] <- rowSums(S4_wgt, na.rm = TRUE)
}
theta0_all[it, 1] <- stats::rbeta(
1, prior0[1] + S1_wgt,
prior0[2] + S2_wgt
)
theta1_all[it, ] <- stats::rbeta(
rep(1, n_element),
prior1[, 1] + S3_wgt[idx_vec],
prior1[, 2] + S4_wgt[idx_vec]
)
# Calculate logLikelihood
logLik_all[it] <- get_logLik(
A1, B1, Config_new, assign_all[it, ],
theta0, theta1
)
# Check convergence. TODO: consider relabel
if ((it >= min_iter) && (it %% 100 == 0)) {
Converged_all <- abs(Geweke_Z(prob_all[seq_len(it), ])) <= 2
# Converged_all <- abs(Geweke_Z(Config_all[1:it, ])) <= 2
if (verbose) {
cat(paste0(
round(mean(Converged_all, na.rm = TRUE), 3) * 100,
"% converged.\n"
))
}
if (mean(Converged_all, na.rm = TRUE) > 0.995) {
break
}
}
}
Converged_all <- abs(Geweke_Z(prob_all[seq_len(it), ])) <= 2
percent_converged <- round(mean(Converged_all, na.rm = TRUE), 3) * 100
if (percent_converged <= 95) {
warning("Only ", percent_converged, "% of parameters converged. ",
"Consider increasing `max_iter`.", call. = FALSE)
    } else {
        message("Converged in ", it, " iterations.")
    }
## Return values
n_buin <- ceiling(it * buin_frac)
a <- A1[idx_mat]
d <- A1[idx_mat] + B1[idx_mat]
binom_pdf1 <- binom_pdf0 <- rep(0, n_element)
for (i in seq(n_buin, it)) {
binom_pdf1 <- binom_pdf1 + stats::dbinom(a,
size = d,
prob = theta1_all[i, ]
)
binom_pdf0 <- binom_pdf0 + stats::dbinom(a,
size = d,
prob = theta0_all[i]
)
}
prob_variant <- matrix(NA, nrow = N, ncol = M)
prob_variant[idx_mat] <- binom_pdf1 / (binom_pdf1 + binom_pdf0)
row.names(prob_variant) <- row.names(A)
colnames(prob_variant) <- colnames(A)
if (relabel) {
col_idx_use <- seq(K)
for (ii in seq(n_buin, it)) {
mat1 <- matrix(prob_all[ii - 1, ], nrow = M)
mat2 <- matrix(prob_all[ii, ], nrow = M)
# mat1 <- matrix(Config_all[ii - 1, ], nrow = N)
# mat2 <- matrix(Config_all[ii, ], nrow = N)
            if (ncol(mat1) <= 5) {
                idx <- colMatch(mat1, mat2, force = TRUE)
            } else {
                idx <- colMatch(mat1, mat2, force = FALSE)
            }
col_idx_use <- col_idx_use[idx]
prob_all[ii, ] <- matrix(prob_all[ii, ], nrow = M)[, col_idx_use]
Config_all[ii, ] <- matrix(Config_all[ii, ],
nrow = N
)[, col_idx_use]
}
}
prob_mat <- matrix(colMeans(prob_all[n_buin:it, ]), nrow = M)
row.names(prob_mat) <- colnames(A)
colnames(prob_mat) <- colnames(Config)
Config_prob <- Config
Config_prob[, ] <- colMeans(Config_all[n_buin:it, ])
theta0 <- mean(theta0_all[n_buin:it, ])
theta1[idx_mat] <- colMeans(as.matrix(theta1_all[n_buin:it, ]))
logLik_post <- get_logLik(A1, B1, Config_prob, prob_mat, theta0, theta1)
DIC <- devianceIC(logLik_all[n_buin:it], logLik_post)
# DIC <- devianceIC(logLik_all[n_buin:it],
# A1, B1, Config_prob, theta0, theta1, Psi, W_log)
return_list <- list(
"theta0" = theta0, "theta1" = theta1,
"theta0_all" = as.matrix(theta0_all[seq_len(it), ]),
"theta1_all" = as.matrix(theta1_all[seq_len(it), ]),
"element" = idx_mat, "logLik" = logLik_all[seq_len(it)],
"prob_all" = prob_all[seq_len(it), ],
"prob" = prob_mat, "prob_variant" = prob_variant,
"relax_rate" = mean(relax_rate_all[n_buin:it]),
"Config_prob" = Config_prob,
"Config_all" = Config_all[seq_len(it), ],
"relax_rate_all" = relax_rate_all[seq_len(it)], "DIC" = DIC
)
return_list
}
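
## Minimal sketch of the conjugate update used for theta0/theta1 above: with
## a Beta(a, b) prior and S alternative reads out of S + F informative reads,
## the posterior for the rate is Beta(a + S, b + F). Toy numbers only.
if (FALSE) {
    a <- 0.2; b <- 99.8 # matches the default prior0 (false positive rate)
    S <- 4; F_ <- 996 # ALT / REF read counts at Config = 0 sites
    theta0_draw <- stats::rbeta(1, a + S, b + F_)
}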
#' Geweke diagnostic for MCMC sampling.
#'
#' @param X A matrix of MCMC samples: N samples (rows) by K variables
#' (columns)
#' @param first A float between 0 and 1; the chain's initial region covers
#' the first \code{first} fraction of samples.
#' @param last A float between 0 and 1; the chain's final region runs from
#' the \code{last} fraction through to the end.
#'
#' @author Yuanhua Huang
#' @return \code{Z}, a vector of Z scores, one per variable. When |Z| <= 2,
#' the sampling can be taken as converged.
#'
Geweke_Z <- function(X, first = 0.1, last = 0.5) {
    # The original Geweke diagnostic uses spectral densities to estimate the
    # sample variances (Geweke, 1992), adjusting for the two 'samples' not
    # being independent, as does the implementation in coda::geweke.diag().
    # Here a simpler plug-in estimate of the variances is used.
    if (is.null(dim(X))) {
        X <- matrix(X, ncol = 1)
    }
N <- nrow(X)
A <- X[seq_len(floor(first * N)), , drop = FALSE]
B <- X[ceiling(last * N):N, , drop = FALSE]
A_col_mean <- colMeans(A)
B_col_mean <- colMeans(B)
    A_col_var <- rowSums((t(A) - A_col_mean)^2) / (nrow(A) - 1)
    B_col_var <- rowSums((t(B) - B_col_mean)^2) / (nrow(B) - 1)
min_var <- 10^(-50)
Z <- (A_col_mean - B_col_mean) / sqrt(A_col_var + B_col_var + min_var)
Z
}
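
## Minimal sketch: the Geweke Z score on toy chains. A stationary chain
## should give |Z| <= 2; a chain with a strong trend should not. `X_stat`
## and `X_drift` are hypothetical.
if (FALSE) {
    set.seed(1)
    X_stat <- matrix(rnorm(2000), ncol = 2) # 1000 samples x 2 variables
    X_drift <- X_stat + seq(0, 10, length.out = 1000) # add a linear trend
    abs(Geweke_Z(X_stat)) # typically <= 2
    abs(Geweke_Z(X_drift)) # typically > 2
}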
#' Log likelihood of the clone_id model
#'
#' Returns P(A, D | C, I, theta0, theta1).
#'
#' @param A1 variant x cell matrix of integers; number of alternative allele
#' reads in variant i cell j
#' @param B1 variant x cell matrix of integers; number of reference allele
#' reads in variant i cell j
#' @param Config variant x clone matrix of float values. The clone-variant
#' configuration probability, averaged by posterior samples
#' @param Assign cells x clone matrix of float values. The cell-clone
#' assignment probability, averaged by posterior samples
#' @param theta0 the binomial rate for alternative allele from config = 0
#' @param theta1 the binomial rate for alternative allele from config = 1
#'
#' @author Yuanhua Huang
#' @return \code{logLik}, a float of log likelihood
#'
get_logLik <- function(A1, B1, Config, Assign, theta0, theta1) {
if (is.null(dim(Assign)) || length(dim(Assign)) == 1) {
Assign_prob <- matrix(0, length(Assign), ncol(Config))
for (i in seq_len(length(Assign))) {
Assign_prob[i, Assign[i]] <- 1
}
} else {
Assign_prob <- Assign
}
prob_mat <- Config %*% t(Assign_prob)
Lik_mat <- (exp(log(theta1) * A1 + log(1 - theta1) * B1) * prob_mat +
exp(log(theta0) * A1 + log(1 - theta0) * B1) * (1 - prob_mat))
logLik <- (sum(log(Lik_mat), na.rm = TRUE) +
sum(lchoose(A1 + B1, A1), na.rm = TRUE))
logLik
}
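
## Minimal sketch: evaluate the log likelihood of a toy configuration with
## hard assignments. All objects here are hypothetical toys.
if (FALSE) {
    A1 <- matrix(c(3, 0, 4, 1), nrow = 2) # ALT reads, 2 variants x 2 cells
    B1 <- matrix(c(2, 6, 1, 7), nrow = 2) # REF reads
    Conf <- matrix(c(0, 0, 1, 1), nrow = 2) # clone 2 carries both variants
    get_logLik(A1, B1, Conf, Assign = c(2, 1), theta0 = 0.02, theta1 = 0.45)
}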
#' Deviance Information Criterion for cardelino model
#'
#' @param logLik_all A vector of numeric; the log likelihood of each
#' posterior sample, i.e., the posterior samples of the deviance
#' @param logLik_post numeric(1); the log likelihood at the posterior mean
#' parameters, i.e., the deviance of the posterior means
#'
#' @author Yuanhua Huang
#' @return \code{DIC}, a float of deviance information criterion
#'
devianceIC <- function(logLik_all, logLik_post) {
# https://www.mrc-bsu.cam.ac.uk/wp-content/uploads/DIC-slides.pdf
# Spiegelhalter et al. The deviance information criterion: 12 years on, 2014
# Spiegelhalter et al. Bayesian measures of model complexity and fit, 2002
    # Gelman et al. Bayesian Data Analysis. 3rd Edition, 2013
    # (Chapter 7.2, p. 173)
logLik_mean <- mean(logLik_all)
logLik_var <- var(logLik_all)
p_D_Spiegelhalter <- -2 * logLik_mean - (-2 * logLik_post)
DIC_Spiegelhalter <- -2 * logLik_post + 2 * p_D_Spiegelhalter
p_D_Gelman <- 2 * logLik_var
DIC_Gelman <- -2 * logLik_post + 2 * p_D_Gelman
DIC <- DIC_Gelman
cat(paste(
"DIC:", round(DIC, 2),
"D_mean:", round(-2 * logLik_mean, 2),
"D_post:", round(-2 * logLik_post, 2),
"logLik_var:", round(logLik_var, 2), "\n"
))
list(
"DIC" = DIC,
"logLik_var" = logLik_var,
"D_mean" = -2 * logLik_mean,
"D_post" = -2 * logLik_post,
"DIC_Gelman" = DIC_Gelman,
"DIC_Spiegelhalter" = DIC_Spiegelhalter
)
}
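
## Minimal sketch: DIC from a toy posterior sample of log likelihoods.
## `ll_samples` and the numbers are hypothetical.
if (FALSE) {
    set.seed(1)
    ll_samples <- rnorm(500, mean = -1000, sd = 3) # sampled log likelihoods
    devianceIC(ll_samples, logLik_post = -995)
}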