# (stray non-code text removed during review; this file is R source)
# Hidden Markov Model simulation with Gamma-Poisson data
# ******************************************************************************************************-----
# General Helpers: ------------------------------------------------------------------------------------------
# ******************************************************************************************************-----
#' Calculate a Kullback-Leibler Divergence for a Discrete Distribution
#'
#' @param p (numeric) probabilities
#'
#' @param q (numeric) probabilities
#'
#' @details
#' The Kullback-Leibler divergence for a discrete distribution
#' is calculated as follows:
#'
#' \deqn{\sum_{i=1}^n p_i \log\Big(\frac{p_i}{q_i}\Big)}
#'
#' @return
#' A single numeric value: the Kullback-Leibler divergence
#'
#' @export
#'
#' @examples
#' # Simulate n Poisson distributed variates
#' n <- 1000
#' dist1 <- rpois(n, lambda = 1)
#' dist2 <- rpois(n, lambda = 5)
#' dist3 <- rpois(n, lambda = 20)
#'
#' # Generate common factor levels
#' x_max <- max(c(dist1, dist2, dist3))
#' all_levels <- 0:x_max
#'
#' # Estimate probability mass functions
#' pmf_dist1 <- table(factor(dist1, levels = all_levels)) / n
#' pmf_dist2 <- table(factor(dist2, levels = all_levels)) / n
#' pmf_dist3 <- table(factor(dist3, levels = all_levels)) / n
#'
#' # Visualise PMFs
#' barplot(pmf_dist1, col = "green", xlim = c(0, x_max))
#' barplot(pmf_dist2, col = "red", add = TRUE)
#' barplot(pmf_dist3, col = "blue", add = TRUE)
#'
#' # Calculate distances
#' kullback_leibler_disc(pmf_dist1, pmf_dist2)
#' kullback_leibler_disc(pmf_dist1, pmf_dist3)
#' kullback_leibler_disc(pmf_dist2, pmf_dist3)
kullback_leibler_disc <- function(p, q) {
  # Kullback-Leibler divergence sum(p * log(p / q)) between two discrete
  # probability distributions given as equal-length numeric vectors.
  if (!is.numeric(p)) {
    stop("kullback_leibler_disc(): `p` must be numeric", call. = FALSE)
  }
  if (!is.numeric(q)) {
    stop("kullback_leibler_disc(): `q` must be numeric", call. = FALSE)
  }
  # `any()` returns a scalar, so the scalar short-circuit operator `||`
  # is the correct choice here (was the elementwise `|`).
  if (any(p < 0) || any(p > 1)) {
    warning("kullback_leibler_disc(): `p` must be between 0 and 1", call. = FALSE)
  }
  if (any(q < 0) || any(q > 1)) {
    warning("kullback_leibler_disc(): `q` must be between 0 and 1", call. = FALSE)
  }
  if (length(p) != length(q)) {
    stop("kullback_leibler_disc(): `p` must be the same length as `q`", call. = FALSE)
  }
  # Small offset guards against log(0) and division by zero for
  # categories with zero estimated mass.
  p <- p + 1e-12
  q <- q + 1e-12
  sum(p * log(p / q))
}
# ******************************************************************************************************-----
# Gamma-Poisson Specific Functions : ------------------------------------------------------------------------------------------
# ******************************************************************************************************-----
#' Simulate data distributed according to oHMMed with gamma-poisson emission densities
#'
#' @param L (integer) number of simulations
#'
#' @param mat_T (matrix) a square transition probability matrix between the hidden states
#'
#' @param betas (numeric) \code{rate} parameter in \code{\link{rgamma}} for emission probabilities
#'
#' @param alpha (numeric) \code{shape} parameter in \code{\link{rgamma}} for emission probabilities
#'
#' @return
#' Returns a list with the following elements:
#' \itemize{
#' \item \code{data}: numeric vector with data
#' \item \code{states}: an integer vector with "true" hidden states used to generate the data vector
#' \item \code{pi}: numeric vector with prior probability of states
#' }
#'
#' @export
#'
#' @examples
#' mat_T <- rbind(c(1-0.01, 0.01, 0),
#' c(0.01, 1-0.02, 0.01),
#' c(0, 0.01, 1-0.01))
#' L <- 2^7
#' betas <- c(0.1, 0.3, 0.5)
#' alpha <- 1
#'
#' sim_data <- hmm_simulate_gamma_poisson_data(L = L,
#' mat_T = mat_T,
#' betas = betas,
#' alpha = alpha)
#' hist(sim_data$data,
#' breaks = 40,
#' main = "Histogram of Simulated Gamma-Poisson Data",
#' xlab = "")
#' sim_data
hmm_simulate_gamma_poisson_data <- function(L, mat_T, betas, alpha) {
  # Simulate a hidden Markov chain of length L whose emissions are
  # gamma-poisson mixed: state i emits
  # rpois(1, rgamma(1, shape = alpha, rate = betas[i])).
  if (length(alpha) != 1) {
    stop("hmm_simulate_gamma_poisson_data(): shape parameter `alpha` must be of length 1", call. = FALSE)
  }
  if (alpha < 0) {
    stop("hmm_simulate_gamma_poisson_data(): shape parameter `alpha` must be non-negative", call. = FALSE)
  }
  if (any(betas < 0)) {
    stop("hmm_simulate_gamma_poisson_data(): rate parameters `betas` must be non-negative", call. = FALSE)
  }
  if (!is.matrix(mat_T)) {
    stop("hmm_simulate_gamma_poisson_data(): `mat_T` must be a numeric matrix", call. = FALSE)
  }
  if (any(is_row_sum_one_(mat_T) == FALSE)) {
    stop("hmm_simulate_gamma_poisson_data(): rows in the transition matrix `mat_T` must sum up to 1", call. = FALSE)
  }
  # Also reject non-positive L: the old check let L = 0 through, where
  # the `2:L` loop below would have run with indices c(2, 1).
  if (length(L) != 1 || L < 1) {
    stop("hmm_simulate_gamma_poisson_data(): the number of simulations `L` must be a single positive integer", call. = FALSE)
  }
  # One emission rate is needed per hidden state; otherwise the loop
  # below would silently index past `betas` and emit NAs.
  if (length(betas) != nrow(mat_T)) {
    stop("hmm_simulate_gamma_poisson_data(): `betas` must contain one rate per state (nrow(mat_T))", call. = FALSE)
  }
  vstate <- rep(NA_integer_, L)
  vemit <- rep(NA_real_, L)
  n_states <- nrow(mat_T)
  # Stationary distribution of the chain; used to draw the first state.
  pi <- get_pi_(mat_T)
  vstate[1] <- sample.int(n = n_states, size = 1, replace = TRUE, prob = pi)
  vemit[1] <- stats::rpois(1, stats::rgamma(1, shape = alpha, rate = betas[vstate[1]]))
  # seq_len(L)[-1] is empty when L == 1, unlike 2:L which would be c(2, 1).
  for (i in seq_len(L)[-1]) {
    p_state <- mat_T[vstate[i - 1], ]
    vstate[i] <- sample.int(n = n_states, size = 1, replace = TRUE, prob = p_state)
    vemit[i] <- stats::rpois(1, stats::rgamma(1, shape = alpha, rate = betas[vstate[i]]))
  }
  list("data" = vemit,
       "states" = factor(vstate, levels = 1:n_states),
       "pi" = pi)
}
#' Forward-Backward Algorithm to Calculate the Posterior Probabilities of Hidden States in Poisson-Gamma Model
#'
#' Forward-Backward Algorithm to Calculate the Posterior Probabilities of Hidden States in Poisson-Gamma Model
#'
#' @param data (numeric) Poisson data
#'
#' @param pi (numeric) prior probability of states
#'
#' @param mat_T (matrix) transition probability matrix
#'
#' @param betas (numeric) vector with prior rates
#'
#' @param alpha (numeric) prior shape parameter
#'
#' @details
#' Please see \href{https://static-content.springer.com/esm/art\%3A10.1186\%2Fs12859-024-05751-4/MediaObjects/12859_2024_5751_MOESM1_ESM.pdf}{supplementary information} at \doi{10.1186/s12859-024-05751-4} for more details on the algorithm.
#'
#' @return
#' List with the following elements:
#' \itemize{
#' \item \code{F}: auxiliary forward variables
#' \item \code{B}: auxiliary backward variables
#' \item \code{s}: weights
#' }
#'
#' @export
#'
#' @examples
#' mat_T <- rbind(c(1-0.01,0.01,0),
#' c(0.01,1-0.02,0.01),
#' c(0,0.01,1-0.01))
#' L <- 2^10
#' betas <- c(0.1, 0.3, 0.5)
#' alpha <- 1
#'
#' sim_data <- hmm_simulate_gamma_poisson_data(L = L,
#' mat_T = mat_T,
#' betas = betas,
#' alpha = alpha)
#' pi <- sim_data$pi
#' hmm_poison_data <- sim_data$data
#' hist(hmm_poison_data, breaks = 100)
#'
#' # Calculate posterior probabilities of hidden states
#' post_prob <- posterior_prob_gamma_poisson(data = hmm_poison_data,
#' pi = pi,
#' mat_T = mat_T,
#' betas = betas,
#' alpha = alpha)
#' str(post_prob)
posterior_prob_gamma_poisson <- function(data, pi, mat_T, betas, alpha) {
# Scaled forward-backward recursion for an HMM with gamma-poisson
# emissions. Marginalising the gamma-distributed Poisson mean gives a
# negative binomial emission density: dnbinom(x, size = alpha,
# prob = beta / (1 + beta)).
#
# Returns list(F, B, s): row-normalised forward variables, the matching
# backward variables, and the per-position scaling weights, so that
# sum(log(s)) equals the data log-likelihood.
cap_ <- .Machine$double.xmax   # upper clamp passed to cap_floor_()
floor_ <- .Machine$double.xmin # lower clamp; keeps probabilities strictly > 0
L <- length(data)
n_states <- nrow(mat_T)
mat_F <- matrix(ncol = n_states, nrow = L)
vs <- rep(1, L)
# Guard against exact zeros in the stationary distribution.
pi[pi < floor_] <- floor_
# Negative binomial success probability per state.
nb_prob <- betas / (1 + betas)
# cap_floor_() presumably clamps its input into [floor_, cap_] --
# TODO confirm against the helper's definition elsewhere in the package.
vtemp <- cap_floor_(pi * stats::dnbinom(x = data[1], prob = nb_prob, size = alpha), cap_, floor_)
vs[1] <- sum(vtemp)
mat_F[1,] <- vtemp / vs[1]
# Forward
for (l in 2:L) {
probs <- cap_floor_(stats::dnbinom(x = data[l], prob = nb_prob, size = alpha), cap_, floor_)
# Propagate through the transition matrix, then weight by the emission
# density and renormalise; vs[l] records the scaling factor.
vtemp <- mat_F[l - 1,] %*% mat_T * probs
vs[l] <- sum(vtemp)
mat_F[l, ] <- vtemp / vs[l]
}
# Backward
mat_B <- matrix(nrow = L, ncol = n_states)
# Initialise with 1 / s[L] so F and B share the same scaling.
mat_B[L, ] <- rep(1, n_states) / vs[L]
for (l in L:2) {
probs <- cap_floor_(stats::dnbinom(x = data[l], prob = nb_prob, size = alpha), cap_, floor_)
mat_B[l - 1, ] <- mat_T %*% (probs * mat_B[l, ]) / vs[l - 1]
}
list(F = mat_F, B = mat_B, s = vs)
}
#' @keywords internal
sample_states_pois_ <- function(mat_R, mat_T, pi, betas, alpha, data) {
  # Draw one hidden-state path given the forward-backward results
  # `mat_R` (list with F, B, s as returned by
  # posterior_prob_gamma_poisson()). States are sampled sequentially:
  # the first from its marginal posterior, the rest conditional on the
  # previous state.
  L <- length(mat_R$F[ ,1])
  n_states <- length(mat_R$F[1, ])
  states <- rep(NA, L)
  cap_ <- .Machine$double.xmax
  floor_ <- .Machine$double.xmin
  # Posterior of the first state is proportional to F[1, ] * B[1, ]
  # (the scalar s[1] cancels on normalisation). BUG FIX: the previous
  # code multiplied F[1, ] by itself instead of by the backward
  # variable B[1, ].
  p <- cap_floor_(mat_R$F[1, ] * mat_R$B[1, ] * mat_R$s[1], cap_, floor_)
  p <- p / sum(p)
  p[is.na(p)] <- floor_
  all_runif <- stats::runif(L)
  # Inverse-CDF draw: first index where the cumulative mass exceeds u.
  states[1] <- which.max(cumsum(p) - all_runif[1] > 0)
  nb_prob <- betas / (1 + betas)
  # Unnormalised negative binomial emission weights; the binomial
  # coefficient depends only on data[i], so it cancels when normalising
  # over states.
  probs <- matrix(NA, nrow = L, ncol = length(betas))
  for (i in 1:L) {
    probs[i, ] <- nb_prob^alpha * (1 - nb_prob)^data[i]
  }
  probs <- cap_floor_(probs, cap_, floor_)
  for (l in 2:L) {
    # p(state_l | state_{l-1}, data) is proportional to
    # T[prev, ] * emission * B[l, ].
    p <- mat_T[states[l - 1], ] * probs[l, ] * mat_R$B[l, ]
    p <- p / sum(p)
    p[is.na(p)] <- floor_
    states[l] <- which.max(cumsum(p) - all_runif[l] > 0)
  }
  factor(states, levels = 1:n_states)
}
#' @keywords internal
sample_betas_alpha_ <- function(cur_betas, cur_alpha, prior_betas, prior_alpha, states = NULL, data = NULL) {
  # Gibbs update of the per-state rate parameters `betas`, followed by a
  # Metropolis-Hastings update of the shared shape parameter `alpha`.
  # Returns list(betas, alpha) with betas sorted in decreasing order.
  N <- length(data)
  betas <- NULL
  alpha <- NULL
  n_states <- length(cur_betas)
  loglambda_p <- prior_alpha
  lvl <- list()  # latent gamma means sampled per observation, split by state
  for (i in 1:n_states) {
    vl <- data[states == i]
    N_i <- length(vl)
    # Conjugate draw of the latent Poisson means for state i given counts.
    vl <- stats::rgamma(N_i, shape = cur_alpha + vl, rate = cur_betas[i] + 1)
    lvl[[i]] <- vl
    # Conjugate gamma posterior for the rate parameter of state i.
    # (Removed the unused local `rate_gamma` that was assigned here.)
    post_shape <- cur_alpha * N_i + 1
    post_rate <- sum(lvl[[i]]) + 1 / prior_betas[i]
    betas[i] <- stats::rgamma(1, shape = post_shape, rate = post_rate)
  }
  # Enforce the ordering constraint on the states (identifiability).
  # NOTE(review): `lvl` is not reordered to match the sorted betas --
  # confirm this mirrors the intended sampler behaviour.
  betas <- sort(betas, decreasing = TRUE)
  for (i in 1:n_states) {
    if (N > 0) {
      loglambda_p <- loglambda_p + sum(log(betas[i] * lvl[[i]]))
    }
  }
  # Geometric-style mean of the scaled latent means; enters the MH target.
  HM <- exp(loglambda_p / (N + 1))
  shape_param <- (N + prior_alpha) * cur_alpha
  # Gamma proposal for the new alpha.
  alpha_star <- stats::rgamma(1, shape = shape_param, rate = N)
  lh_star <- 0
  lh_old <- 0
  for (i in 1:n_states) {
    lh_star <- lh_star + (log(HM) * alpha_star - lgamma(alpha_star)) * N
    lh_old <- lh_old + (log(HM) * cur_alpha - lgamma(cur_alpha)) * N
  }
  # Proposal densities for the Hastings correction.
  d1 <- stats::dgamma(alpha_star, shape = shape_param, rate = N, log = TRUE)
  d2 <- stats::dgamma(cur_alpha, shape = shape_param, rate = N, log = TRUE)
  lr <- lh_star - d1 - (lh_old - d2)
  # Accept or reject the proposed alpha.
  alpha <- cur_alpha
  if (stats::runif(1) <= exp(lr)) {
    alpha <- alpha_star
  }
  list(betas = betas, alpha = alpha)
}
#' @keywords internal
init_hmm_mcmc_gamma_poisson_ <- function(data, prior_T, prior_betas, prior_alpha,
                                         init_T, init_betas, init_alpha,
                                         verbose, iter, warmup, thin, chain_id = NULL) {
  # Validate the MCMC settings and inputs, then draw (or check) the
  # initial parameter values and the initial hidden-state path.
  if (iter < 2) {
    stop("hmm_mcmc_gamma_poisson(): `iter` needs to be bigger than 1", call. = FALSE)
  }
  if (!is.integer(data)) {
    stop("hmm_mcmc_gamma_poisson(): `data` needs to be an integer vector", call. = FALSE)
  }
  if (sum(is.na(data)) > 0) {
    stop("hmm_mcmc_gamma_poisson(): `data` contains missing values", call. = FALSE)
  }
  if (any(is_row_sum_one_(prior_T) == FALSE)) {
    stop("hmm_mcmc_gamma_poisson(): rows in the transition matrix `prior_T` must sum up to 1", call. = FALSE)
  }
  if (length(prior_betas) != nrow(prior_T) || length(prior_betas) != ncol(prior_T)) {
    stop("hmm_mcmc_gamma_poisson(): number of states is not the same between input variables", call. = FALSE)
  }
  if (length(prior_alpha) != 1) {
    stop("hmm_mcmc_gamma_poisson(): `prior_alpha` must be of length 1", call. = FALSE)
  }
  # BUG FIX: the old condition `warmup >= iter + 2` contradicted the
  # error message and let `warmup` equal `iter` or `iter + 1` through,
  # leaving no post-warmup iterations downstream.
  if (warmup >= iter) {
    stop("hmm_mcmc_gamma_poisson(): `warmup` must be lower than `iter`", call. = FALSE)
  }
  if (thin > iter - warmup) {
    stop("hmm_mcmc_gamma_poisson(): `thin` cannot exceed iterations after warmup period", call. = FALSE)
  }
  # Initialization: draw any initial values not supplied by the user.
  if (is.null(init_betas)) {
    init_betas <- stats::rgamma(length(prior_betas), shape = 1, rate = 1 / prior_betas)
  }
  if (is.null(init_alpha)) {
    init_alpha <- stats::rgamma(1, shape = prior_alpha, rate = 1)
  }
  if (is.null(init_T)) {
    init_T <- sample_T_(prior_mat = prior_T)
  }
  if (length(init_betas) != nrow(init_T) || length(init_betas) != ncol(init_T)) {
    stop("hmm_mcmc_gamma_poisson(): number of states is not the same between input variables", call. = FALSE)
  }
  if (length(init_alpha) != 1) {
    stop("hmm_mcmc_gamma_poisson(): `init_alpha` must be of length 1", call. = FALSE)
  }
  if (any(is_row_sum_one_(init_T) == FALSE)) {
    stop("hmm_mcmc_gamma_poisson(): rows in the transition matrix `init_T` must sum up to 1", call. = FALSE)
  }
  # Monte-Carlo estimate of the emission mean implied by each initial rate
  # (used only for the diagnostic messages below).
  lambda2 <- numeric(length(init_betas))
  for (i in seq_along(init_betas)) {
    lambda2[i] <- mean(stats::rgamma(1000, init_alpha, init_betas[i]))
  }
  chain_char <- NULL
  if (!is.null(chain_id)) {
    chain_char <- paste0("(chain ", chain_id, ") ")
  }
  if (verbose) {
    # Method-of-moments suggestion for alpha; warn if the supplied value
    # is far from it.
    set_alpha <- (mean(data)^2) / ((stats::var(data) - mean(data)) / length(init_betas))
    if ((init_alpha > (set_alpha + 3)) || (init_alpha < (set_alpha - 3))) {
      message("hmm_mcmc_gamma_poisson(): ", chain_char, "initial alpha is not close to the recommended value")
    }
    if (any(lambda2 > stats::var(data))) {
      message("hmm_mcmc_gamma_poisson(): ", chain_char, "at least one initial rate parameter is greater than the overall variance")
    }
  }
  init_pi <- get_pi_(init_T)
  init_mat_res <- posterior_prob_gamma_poisson(data = data,
                                               pi = init_pi,
                                               mat_T = init_T,
                                               betas = init_betas,
                                               alpha = init_alpha)
  init_states <- sample_states_pois_(mat_R = init_mat_res,
                                     pi = init_pi,
                                     mat_T = init_T,
                                     betas = init_betas,
                                     alpha = init_alpha,
                                     data = data)
  list(init_states = init_states, init_betas = init_betas,
       init_alpha = init_alpha, init_T = init_T, init_pi = init_pi,
       init_mat_res = init_mat_res)
}
#' MCMC Sampler for the Hidden Markov Model with Gamma-Poisson emission densities
#'
#' @param data (numeric) data
#'
#' @param prior_T (matrix) prior transition matrix
#'
#' @param prior_betas (numeric) prior beta parameters
#'
#' @param prior_alpha (numeric) a single prior alpha parameter. By default, \code{prior_alpha=1}
#'
#' @param iter (integer) number of MCMC iterations
#'
#' @param warmup (integer) number of warmup iterations
#'
#' @param thin (integer) thinning parameter. By default, \code{1}
#'
#' @param seed (integer) seed parameter
#'
#' @param init_T (matrix) \code{optional parameter}; initial transition matrix
#'
#' @param init_betas (numeric) \code{optional parameter}; initial beta parameters
#'
#' @param init_alpha (numeric) \code{optional parameter}; initial alpha parameter
#'
#' @param print_params (logical) \code{optional parameter}; print estimated parameters every iteration. By default, \code{TRUE}
#'
#' @param verbose (logical) \code{optional parameter}; print additional messages. By default, \code{TRUE}
#'
#' @details
#' Please see \href{https://static-content.springer.com/esm/art\%3A10.1186\%2Fs12859-024-05751-4/MediaObjects/12859_2024_5751_MOESM1_ESM.pdf}{supplementary information} at \doi{10.1186/s12859-024-05751-4} for more details on the algorithm.
#'
#' For usage recommendations please see \url{https://github.com/LynetteCaitlin/oHMMed/blob/main/UsageRecommendations.pdf}.
#'
#' @references
#' Claus Vogl, Mariia Karapetiants, Burçin Yıldırım, Hrönn Kjartansdóttir, Carolin Kosiol, Juraj Bergman, Michal Majka, Lynette Caitlin Mikula.
#' Inference of genomic landscapes using ordered Hidden Markov Models with emission densities (oHMMed).
#' BMC Bioinformatics 25, 151 (2024). \doi{10.1186/s12859-024-05751-4}
#'
#' @return
#' List with following elements:
#' \itemize{
#' \item \code{data}: data used for simulation
#' \item \code{samples}: list with samples
#' \item \code{estimates}: list with various estimates
#' \item \code{idx}: indices with iterations after the warmup period
#' \item \code{priors}: prior parameters
#' \item \code{inits}: initial parameters
#' \item \code{last_iter}: list with samples from the last MCMC iteration
#' \item \code{info}: list with various meta information about the object
#' }
#'
#' @export
#'
#' @examples
#' # Simulate Poisson-Gamma data
#' N <- 2^10
#' true_T <- rbind(c(0.95, 0.05, 0),
#' c(0.025, 0.95, 0.025),
#' c(0.0, 0.05, 0.95))
#'
#' true_betas <- c(2, 1, 0.1)
#' true_alpha <- 1
#'
#' simdata_full <- hmm_simulate_gamma_poisson_data(L = N,
#' mat_T = true_T,
#' betas = true_betas,
#' alpha = true_alpha)
#' simdata <- simdata_full$data
#' hist(simdata, breaks = 40, probability = TRUE,
#' main = "Distribution of the simulated Poisson-Gamma data")
#' lines(density(simdata), col = "red")
#'
#' # Set numbers of states to be inferred
#' n_states_inferred <- 3
#'
#' # Set priors
#' prior_T <- generate_random_T(n_states_inferred)
#' prior_betas <- c(1, 0.5, 0.1)
#' prior_alpha <- 3
#'
#' # Simulation settings
#' iter <- 50
#' warmup <- floor(iter / 5) # 20 percent
#' thin <- 1
#' seed <- sample.int(10000, 1)
#' print_params <- FALSE # if TRUE then parameters are printed in each iteration
#' verbose <- FALSE # if TRUE then the state of the simulation is printed
#'
#' # Run MCMC sampler
#' res <- hmm_mcmc_gamma_poisson(data = simdata,
#' prior_T = prior_T,
#' prior_betas = prior_betas,
#' prior_alpha = prior_alpha,
#' iter = iter,
#' warmup = warmup,
#' thin = thin,
#' seed = seed,
#' print_params = print_params,
#' verbose = verbose)
#' res
#'
#' summary(res)# summary output can be also assigned to a variable
#'
#' coef(res) # extract model estimates
#'
#' # plot(res) # MCMC diagnostics
hmm_mcmc_gamma_poisson <- function(data,
                                   prior_T,
                                   prior_betas,
                                   prior_alpha = 1,
                                   iter = 5000,
                                   warmup = floor(iter / 1.5),
                                   thin = 1,
                                   seed = sample.int(.Machine$integer.max, 1),
                                   init_T = NULL,
                                   init_betas = NULL,
                                   init_alpha = NULL,
                                   print_params = TRUE,
                                   verbose = TRUE) {
  # Main MCMC loop for the gamma-poisson HMM. Each iteration:
  #   1. Gibbs/MH update of the emission parameters (betas, alpha),
  #   2. update of the transition matrix given the current state path,
  #   3. forward-backward pass and a fresh draw of the hidden states.
  set.seed(seed)
  init_data <- init_hmm_mcmc_gamma_poisson_(data, prior_T, prior_betas, prior_alpha,
                                            init_T, init_betas, init_alpha,
                                            verbose, iter, warmup, thin)
  n_data <- length(data)
  n_states <- length(prior_betas)
  # Per-position counts of post-warmup state assignments.
  mean_states <- matrix(0, ncol = n_states, nrow = n_data)
  all_betas <- matrix(0, ncol = n_states, nrow = iter)
  all_alpha <- numeric(iter)
  all_mat_T <- array(dim = c(dim(prior_T), iter))
  vlh <- numeric(iter)
  all_betas[1, ] <- init_data$init_betas
  all_alpha[1] <- init_data$init_alpha
  all_mat_T[, ,1] <- init_data$init_T
  # Log-likelihood is the sum of the log scaling weights.
  vlh[1] <- sum(log(init_data$init_mat_res$s))
  states <- init_data$init_states
  betas <- init_data$init_betas
  alpha <- init_data$init_alpha
  # Run sampler
  for (it in 2:iter) {
    print_progress_(it, iter, verbose = verbose)
    m <- sample_betas_alpha_(cur_betas = betas,
                             cur_alpha = alpha,
                             prior_betas = prior_betas,
                             prior_alpha = prior_alpha,
                             states = states,
                             data = data)
    betas <- m$betas
    alpha <- m$alpha
    mat_T <- sample_T_(states, prior_mat = prior_T)
    pi <- get_pi_(mat_T)
    mat_res <- posterior_prob_gamma_poisson(data = data,
                                            pi = pi,
                                            mat_T = mat_T,
                                            betas = betas,
                                            alpha = alpha)
    states <- sample_states_pois_(mat_R = mat_res,
                                  pi = pi,
                                  mat_T = mat_T,
                                  betas = betas,
                                  alpha = alpha,
                                  data = data)
    # BUG FIX: accumulate state counts only after the warmup period.
    # The previous condition `iter > warmup` was constant TRUE, so
    # warmup iterations contaminated the posterior state frequencies.
    if (it > warmup) {
      for (i in 1:n_states) {
        mean_states[ ,i] <- mean_states[ ,i] + (states == i)
      }
    }
    all_betas[it, ] <- betas
    all_alpha[it] <- alpha
    all_mat_T[ , ,it] <- mat_T
    vlh[it] <- sum(log(mat_res$s))
    if (print_params) {
      print(c(colMeans(all_betas[1:it, ]), mean(all_alpha[1:it])))
    }
  }
  # Prepare outputs
  idx <- seq.int(warmup + 1, by = thin, to = iter)
  colnames(all_betas) <- paste0("beta[", 1:n_states,"]")
  # Posterior means per state; the length-iter vector recycles down the
  # rows of the iter x n_states matrix, aligning iteration-wise.
  all_means <- all_alpha / all_betas
  colnames(all_means) <- paste0("means[", 1:n_states,"]")
  samples <- list(betas = all_betas,
                  alpha = all_alpha,
                  means = all_means,
                  mat_T = all_mat_T,
                  vlh = vlh)
  posterior_states <- factor(max.col(mean_states, "first"), 1:n_states)
  estimates <- list(betas = colMeans(all_betas[idx, ]),
                    alpha = mean(all_alpha[idx]),
                    means = colMeans(all_means[idx, ]),
                    mat_T = apply(all_mat_T[ , ,idx], c(1,2), mean),
                    posterior_states = posterior_states,
                    # Normalise by the number of post-warmup iterations so
                    # each row sums to 1 (was divided by `iter`).
                    posterior_states_prob = mean_states / (iter - warmup),
                    log_likelihood = vlh[idx])
  priors <- list(prior_betas = prior_betas,
                 prior_alpha = prior_alpha,
                 prior_T = prior_T)
  inits <- list(init_states = init_data$init_states,
                init_betas = init_data$init_betas,
                init_alpha = init_data$init_alpha,
                init_T = init_data$init_T)
  last_iter <- list(betas = all_betas[iter, ],
                    alpha = all_alpha[iter],
                    means = all_means[iter, ],
                    mat_T = all_mat_T[ , ,iter])
  info <- list(model_name = "hmm_mcmc_gamma_poisson",
               date = as.character(Sys.time()),
               seed = seed,
               iter = iter,
               warmup = warmup,
               thin = thin,
               n_states = length(prior_betas))
  res <- list(data = data,
              samples = samples,
              estimates = estimates,
              idx = idx,
              priors = priors,
              inits = inits,
              last_iter = last_iter,
              info = info)
  class(res) <- "hmm_mcmc_gamma_poisson"
  res$info$object_size <- format(utils::object.size(res), "Mb")
  res
}
#' @keywords internal
#' @export
print.hmm_mcmc_gamma_poisson <- function(x, ...) {
  # Compact console summary of a fitted gamma-poisson HMM MCMC object.
  meta <- x$info
  fields <- list("Model" = "HMM Gamma-Poisson",
                 "Type" = "MCMC",
                 "Iter" = meta$iter,
                 "Warmup" = meta$warmup,
                 "Thin" = meta$thin,
                 "States" = length(x$priors$prior_betas))
  for (label in names(fields)) {
    cat(paste0(label, ":"), fields[[label]], "\n")
  }
}
#' @keywords internal
#' @export
summary.hmm_mcmc_gamma_poisson <- function(object, ...) {
# Textual summary of a fitted gamma-poisson HMM: prints the parameter
# estimates, state assignment counts, an approximate Kullback-Leibler
# divergence between the observed data and data simulated from the
# fitted model, a log-likelihood summary, and stepwise poisson tests
# between neighbouring states. Returns the summary list invisibly.
info <- object$info
idx <- object$idx
data <- object$data
beta_est <- object$estimates$betas
alpha_est <- object$estimates$alpha
T_est <- object$estimates$mat_T
means_est <- object$estimates$means
post_states <- object$estimates$posterior_states
state_tab <- table(post_states, dnn = "")
# Monte-Carlo KL estimate: 500 replicate data sets with the same state
# composition are drawn from the fitted negative binomial emissions
# (size = alpha, prob = beta / (1 + beta)), and the discrete KL
# divergence to the empirical PMF of the data is averaged.
kl_list <- rep(NA, 500)
for (j in 1:500) {
sim_output <- unlist(lapply(1:length(state_tab), function(i) {
stats::rnbinom(state_tab[i],
size = alpha_est,
prob = beta_est[i] / (1 + beta_est[i]))
}))
# Common support so both PMFs are defined over identical categories.
jth_levels <- 0:max(c(data, sim_output))
dens_data_table <- table(factor(data, levels = jth_levels))
dens_sim_table <- table(factor(sim_output, levels = jth_levels))
dens_data <- dens_data_table / sum(dens_data_table)
dens_sim <- dens_sim_table / sum(dens_sim_table)
kl_list[j] <- kullback_leibler_disc(dens_data, dens_sim)
}
kl_div <- mean(kl_list)
# Summary statistics of the post-warmup log-likelihood trace.
ll_info <- c(mean(object$estimates$log_likelihood),
stats::sd(object$estimates$log_likelihood),
stats::median(object$estimates$log_likelihood))
names(ll_info) <- c("mean", "sd", "median")
# One-sided poisson rate tests between each pair of adjacent states;
# NA is left in place when a state has no assigned observations.
group_comparison <- rep(NA, length(beta_est) - 1)
for (k in 1:(length(beta_est) - 1)) {
gr1 <- object$data[object$estimates$posterior_states == k]
gr2 <- object$data[object$estimates$posterior_states == (k + 1)]
names(group_comparison)[k] <- paste0(k, "-", k + 1)
if (length(gr1) == 0 | length(gr2) == 0) {
next
}
group_comparison[k] <- stats::poisson.test(x = c(sum(gr1), sum(gr2)),
T = c(length(gr1), length(gr2)),
alternative = "l")$p.value
}
summary_res <- list("estimated_betas" = beta_est,
"estimated_alpha" = alpha_est,
"estimated_means" = means_est,
"estimated_transition_rates" = T_est,
"assigned_states" = state_tab,
"approximate_kullback_leibler_divergence" = kl_div,
"log_likelihood" = ll_info,
"state_differences_significance" = group_comparison)
cat("Estimated betas:\n")
print(summary_res$estimated_betas)
cat("\n")
# Alpha is only reported when its chain actually moved.
if (stats::sd(object$samples$alpha) > 0) {
cat("Estimated alpha:\n")
cat(summary_res$estimated_alpha)
cat("\n")
cat("\n")
}
cat("Estimated means:\n")
cat(summary_res$estimated_means)
cat("\n")
cat("\n")
cat("Estimated transition rates:\n")
etr <- summary_res$estimated_transition_rates
rownames(etr) <- colnames(etr) <- names(summary_res$assigned_states)
print(etr)
cat("\n")
cat("Number of windows assigned to hidden states:\n")
as <- summary_res$assigned_states
as_names <- attributes(as)$dimnames[[1]]
print(stats::setNames(as.numeric(as), as_names))
cat("\n")
cat("Kullback-Leibler divergence between observed and estimated distributions:\n")
cat(stats::setNames(summary_res$approximate_kullback_leibler_divergence, ""))
cat("\n")
cat("\n")
cat("Log Likelihood:\n")
print(summary_res$log_likelihood)
cat("\n")
cat("P-value of poisson test for difference between rates of states (stepwise):\n")
print(summary_res$state_differences_significance)
cat("\n")
invisible(summary_res)
}
#' @rdname coef.hmm_mcmc_normal
#'
#' @export
#' @export coef.hmm_mcmc_gamma_poisson
#'
#' @examples
#' coef(example_hmm_mcmc_gamma_poisson)
coef.hmm_mcmc_gamma_poisson <- function(object, ...) {
  # Extract the point estimates of the model parameters as a named list.
  with(object$estimates,
       list(betas = betas,
            alpha = alpha,
            means = means,
            mat_T = mat_T))
}
#' Plot Diagnostics for \code{hmm_mcmc_gamma_poisson} Objects
#'
#' This function creates a variety of diagnostic plots that can be useful when
#' conducting Markov Chain Monte Carlo (MCMC) simulation of a gamma-poisson hidden Markov model (HMM).
#' These plots will help to assess convergence, fit, and performance of the MCMC simulation
#'
#' @param x (hmm_mcmc_gamma_poisson) HMM MCMC gamma-poisson object
#'
#' @param simulation (logical); default is \code{simulation=FALSE}, so the input data was empirical. If the input data was simulated, it must be set \code{simulation=TRUE}.
#'
#' @param true_betas (numeric) true betas. To be used if \code{simulation=TRUE}
#'
#' @param true_alpha (numeric) true alpha. To be used if \code{simulation=TRUE}
#'
#' @param true_mat_T (matrix) \code{optional parameter}; true transition matrix. To be used if \code{simulation=TRUE}
#'
#' @param true_states (integer) \code{optional parameter}; true states. To be used if \code{simulation=TRUE}
#'
#' @param show_titles (logical) if \code{TRUE} then titles are shown for all graphs. By default, \code{TRUE}
#'
#' @param log_statesplot (logical) if \code{TRUE} then log-statesplots are shown. By default, \code{FALSE}
#'
#' @param ... not used
#'
#' @return
#' Several diagnostic plots that can be used to evaluate the MCMC simulation
#' of the gamma-poisson HMM
#'
#' @export
#' @export plot.hmm_mcmc_gamma_poisson
#'
#' @examples
#' \donttest{
#' plot(example_hmm_mcmc_gamma_poisson)
#' }
plot.hmm_mcmc_gamma_poisson <- function(x,
                                        simulation = FALSE,
                                        true_betas = NULL,
                                        true_alpha = NULL,
                                        true_mat_T = NULL,
                                        true_states = NULL,
                                        show_titles = TRUE,
                                        log_statesplot = FALSE,
                                        ...) {
  # Diagnostic plots for a fitted gamma-poisson HMM: traceplots and
  # densities of betas, alpha, means and transition rates, the
  # log-likelihood trace, state-assignment plots along the sequence, and
  # a model-fit rootogram. With `simulation = TRUE`, truth-based plots
  # (confusion matrix, true state means) are added.
  info <- x$info
  data <- x$data
  idx <- x$idx
  if (simulation) {
    cond <- any(c(is.null(true_betas), is.null(true_alpha), is.null(true_mat_T),
                  is.null(true_states)))
    if (cond) {
      stop("plot.hmm_mcmc_gamma_poisson(): if `simulation=TRUE` then `true_betas` ",
           "`true_alpha`, `true_mat_T` and `true_states` must be defined", call. = FALSE)
    }
  }
  # Diagnostics betas
  all_betas <- convert_to_ggmcmc(x, pattern = "beta")
  n_betas <- attributes(all_betas)$nParameters
  facet_means <- ggplot2::facet_wrap(~ Parameter, ncol = floor(n_betas / 2), scales = "free")
  mdens <- ggmcmc::ggs_density(all_betas) +
    facet_means +
    ggplot2::labs(x = "Value",
                  y = "Density",
                  title = if (show_titles) "Densities of Betas" else NULL)
  mtrace <- ggmcmc::ggs_traceplot(all_betas) +
    facet_means +
    ggplot2::labs(x = "Iteration",
                  y = "Value",
                  title = if (show_titles) "Traceplots of Betas" else NULL)
  # Diagnostics transition rates
  all_T <- convert_to_ggmcmc(x, pattern = "T")
  # BUG FIX: the ggmcmc attribute is `nParameters`; the previous
  # `nParameter` returned NULL, breaking the facet column count.
  n_t <- attributes(all_T)$nParameters
  facet_t <- ggplot2::facet_wrap(~ Parameter, ncol = sqrt(n_t), scales = "free")
  labels_t <- ggplot2::scale_y_continuous(labels = scales::number_format(accuracy = 0.01, decimal.mark = "."))
  Ttrace <- ggmcmc::ggs_traceplot(all_T) +
    facet_t +
    labels_t +
    ggplot2::labs(x = "Iteration",
                  y = "Value",
                  title = if (show_titles) "Traceplots of Parameters in Transition Matrix" else NULL)
  Tdens <- ggmcmc::ggs_density(all_T) +
    facet_t +
    labels_t +
    ggplot2::labs(x = "Value",
                  y = "Density",
                  title = if (show_titles) "Densities of Parameters in Transition Matrix" else NULL)
  # Diagnostics alpha
  df_alpha <- convert_to_ggmcmc(x, "alpha")
  sdtrace <- ggmcmc::ggs_traceplot(df_alpha) +
    ggplot2::labs(x = "Iteration",
                  y = "Value",
                  title = if (show_titles) "Traceplot of Alpha" else NULL)
  sddens <- ggmcmc::ggs_density(df_alpha) +
    ggplot2::labs(x = "Value",
                  y = "Density",
                  title = if (show_titles) "Density of Alpha" else NULL)
  # Diagnostics mean
  all_means <- convert_to_ggmcmc(x, pattern = "means")
  n_means <- attributes(all_means)$nParameters
  facet_me <- ggplot2::facet_wrap(~ Parameter, ncol = floor(n_means / 2), scales = "free")
  medens <- ggmcmc::ggs_density(all_means) +
    facet_me +
    ggplot2::labs(x = "Value",
                  y = "Density",
                  title = if (show_titles) "Densities of Means" else NULL)
  metrace <- ggmcmc::ggs_traceplot(all_means) +
    facet_me +
    ggplot2::labs(x = "Iteration",
                  y = "Value",
                  title = if (show_titles) "Traceplots of Means" else NULL)
  # Likelihood trace (post-warmup iterations only)
  lltrace <- x$estimates$log_likelihood
  lltrace_df <- as.data.frame(cbind(c((info$warmup + 1):info$iter), lltrace))
  names(lltrace_df) <- c("iteration", "log_likelihood")
  llplot <- ggplot2::ggplot(lltrace_df, ggplot2::aes_string(x = "iteration",
                                                            y = "log_likelihood")) +
    ggplot2::geom_line() +
    ggplot2::labs(x = "Iteration",
                  y = "Log-likelihood",
                  title = if (show_titles) "Traceplot of Log-Likelihood" else NULL)
  # Confusion matrix (simulation only: predicted vs true states)
  if (simulation) {
    m_multi <- list("target" = true_states,
                    "prediction" = x$estimates$posterior_states)
    conf_mat <- cvms::confusion_matrix(targets = m_multi$target,
                                       predictions = m_multi$prediction)
    conf_mat_plot <- suppressWarnings(
      cvms::plot_confusion_matrix(conf_mat$`Confusion Matrix`[[1]],
                                  add_sums = TRUE) +
        ggplot2::labs(title = if (show_titles) "Confusion Matrix" else NULL)
    )
  }
  # Check assignment of states along chromosome
  n_data <- length(data)
  post_means <- numeric(n_data)
  if (!log_statesplot) {
    states_df <- as.data.frame(cbind(1:n_data, data, x$estimates$posterior_states))
    for (l in 1:length(data)) {
      post_means[l] <- sum(x$estimates$alpha / x$estimates$betas * x$estimates$posterior_states_prob[l, ])
    }
  } else {
    # Log scale (log(x + 1)) for both the data and the posterior means.
    states_df <- as.data.frame(cbind(1:n_data, log(data + 1), x$estimates$posterior_states))
    for (l in 1:n_data) {
      post_means[l] <- log((sum(x$estimates$alpha / x$estimates$betas * x$estimates$posterior_states_prob[l, ])) + 1)
    }
  }
  states_df$post_means <- post_means
  names(states_df) <- c("position", "data", "posterior_states", "posterior_means")
  states_df$posterior_states <- as.factor(states_df$posterior_states)
  statesplot <- ggplot2::ggplot(states_df, ggplot2::aes_string(x = "position", y = "data")) +
    ggplot2::geom_line(col = "grey") +
    ggplot2::geom_point(ggplot2::aes_string(colour = "posterior_states"), shape = 20, size = 1.5, alpha = 0.75) +
    ggplot2::geom_line(ggplot2::aes_string(x = "position", y = "posterior_means"), size = 0.15) +
    ggplot2::guides(colour = ggplot2::guide_legend(title = "Post States")) +
    ggplot2::labs(x = "Position",
                  y = "Data",
                  title = if (show_titles) "States Plot" else NULL)
  if (simulation) {
    # BUG FIX: a stray duplicated assignment block used to overwrite
    # `states_df2` after this conditional, which silently disabled the
    # `log_statesplot = TRUE` branch for the true-states plot.
    if (!log_statesplot) {
      states_df2 <- as.data.frame(cbind(1:n_data, data, true_states))
      post_means <- (true_alpha / true_betas)[true_states]
      states_df2$post_means <- post_means
    } else {
      states_df2 <- as.data.frame(cbind(1:n_data, log(data + 1), true_states))
      post_means <- (true_alpha / true_betas)[true_states]
      states_df2$post_means <- log(post_means + 1)
    }
    names(states_df2) <- c("position", "data", "true_states", "true_means")
    states_df2$true_states <- as.factor(states_df2$true_states)
    statesplot2 <- ggplot2::ggplot(states_df2, ggplot2::aes_string(x = "position", y = "data")) +
      ggplot2::geom_line(col = "grey") +
      ggplot2::geom_point(ggplot2::aes_string(colour = "true_states"), shape = 20, size = 1.5, alpha = 0.75) +
      ggplot2::geom_line(ggplot2::aes_string(x = "position", y = "true_means"), size = 0.15) +
      ggplot2::guides(colour = ggplot2::guide_legend(title = "True States")) +
      ggplot2::labs(x = "Position", y = "Data")
  }
  # Check fit: simulate from the fitted negative binomial mixture and
  # compare against the observed counts.
  post_states <- x$estimates$posterior_states
  state_tab <- table(post_states)
  beta_est <- x$estimates$betas
  alpha_est <- x$estimates$alpha
  sim_output1 <- NA
  for (j in 1:500) {
    sim_output <- unlist(lapply(1:length(state_tab), function(i) {
      stats::rnbinom(state_tab[i],
                     size = alpha_est,
                     prob = beta_est[i] / (1 + beta_est[i]))
    }))
    sim_output1 <- c(sim_output1, sim_output)
  }
  sim_output1 <- sim_output1[2:length(sim_output1)] # drop the seeding NA
  dens_data1 <- table(factor(data, levels = 0:max(c(data, sim_output1)))) / sum(table(factor(data, levels = 0:max(c(data, sim_output1)))))
  dens_sim1 <- table(factor(sim_output1, levels = 0:max(c(data, sim_output1)))) / sum(table(factor(sim_output1, levels = 0:max(c(data, sim_output1)))))
  # Deferred so the rootogram is drawn last, after the ggplot objects.
  kl_plot_f <- function() {
    vcd::rootogram(as.numeric(dens_data1),
                   as.numeric(dens_sim1),
                   lines_gp = grid::gpar(col = "red", lwd = 1),
                   main = if (show_titles) "Model Fit" else NULL,
                   points_gp = grid::gpar(col = "black"),
                   pch = "",
                   ylab = "Frequency (sqrt)",
                   xlab = "Number of Occurrences")
  }
  dens_df <- as.data.frame(cbind(rep("observed", length(data)), data))
  names(dens_df) <- c("data_type", "value")
  dens_df$value <- as.numeric(dens_df$value)
  klplot <- ggplot2::ggplot(dens_df, ggplot2::aes_string(x = "value")) +
    ggplot2::geom_histogram(bins = floor(dim(table(factor(data, levels = 0:max(data))))),
                            fill = "grey", position = "identity") +
    ggplot2::scale_color_manual(values = c("black")) +
    ggplot2::geom_vline(xintercept = x$estimates$means, color = "black", size = 0.2) +
    ggplot2::labs(x = "Number of Occurences",
                  y = "Frequency",
                  title = if (show_titles) "Observed Counts and Inferred Means" else NULL)
  if (simulation) {
    klplot <- ggplot2::ggplot(dens_df, ggplot2::aes_string(x = "value")) +
      ggplot2::geom_histogram(bins = floor(dim(table(factor(data, levels = 0:max(data))))),
                              fill = "grey", position = "identity") +
      ggplot2::scale_color_manual(values = c("black")) +
      ggplot2::geom_vline(xintercept = x$estimates$means, color = "black", size = 0.6) +
      ggplot2::geom_vline(xintercept = true_alpha / true_betas, color = "blue",
                          size = 0.4, linetype = "dotted") +
      ggplot2::labs(x = "Number of Occurences",
                    y = "Frequency",
                    title = if (show_titles) "Observed Counts and Inferred (and True) Means" else NULL)
  }
  if (simulation) {
    plotlist <- list(mtrace, mdens, Ttrace, Tdens, sdtrace, sddens, metrace,
                     medens, llplot, conf_mat_plot, klplot)
  }
  if (isFALSE(simulation)) {
    plotlist <- list(mtrace, mdens, Ttrace, Tdens, sdtrace, sddens, metrace,
                     medens, llplot, statesplot, klplot)
  }
  for (ii in 1:length(plotlist)) {
    # In simulation mode the two states plots are arranged together just
    # before the final two plots in the list.
    if (simulation && ii == (length(plotlist) - 2)) {
      gridExtra::grid.arrange(statesplot, statesplot2, nrow = 2)
    }
    invisible(utils::capture.output(print(plotlist[[ii]])))
  }
  kl_plot_f()
}
# (The following lines are website-scraper residue, commented out so the file parses:)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.