R/MCMC_poisson.R


# Hidden Markov Model simulation with Gamma-Poisson data


# ******************************************************************************************************-----
# General Helpers: ------------------------------------------------------------------------------------------
# ******************************************************************************************************-----

#' Calculate the Kullback-Leibler Divergence Between Two Discrete Distributions
#' 
#' @param p (numeric) probabilities
#'
#' @param q (numeric) probabilities
#'
#' @details
#' The Kullback-Leibler divergence for a discrete distribution
#' is calculated as follows:
#' 
#' \deqn{\sum_{i=1}^n p_i \log\Big(\frac{p_i}{q_i}\Big)}
#'
#' @return
#' A single numeric value with the Kullback-Leibler divergence
#'
#' @export
#'
#' @examples
#' # Simulate n Poisson distributed variates
#' n <- 1000
#' dist1 <- rpois(n, lambda = 1)
#' dist2 <- rpois(n, lambda = 5)
#' dist3 <- rpois(n, lambda = 20)
#' 
#' # Generate common factor levels
#' x_max <- max(c(dist1, dist2, dist3))
#' all_levels <- 0:x_max
#' 
#' # Estimate probability mass functions 
#' pmf_dist1 <- table(factor(dist1, levels = all_levels)) / n
#' pmf_dist2 <- table(factor(dist2, levels = all_levels)) / n
#' pmf_dist3 <- table(factor(dist3, levels = all_levels)) / n
#' 
#' # Visualise PMFs
#' barplot(pmf_dist1, col = "green", xlim = c(0, x_max))
#' barplot(pmf_dist2, col = "red", add = TRUE)
#' barplot(pmf_dist3, col = "blue", add = TRUE)
#' 
#' # Calculate distances
#' kullback_leibler_disc(pmf_dist1, pmf_dist2)
#' kullback_leibler_disc(pmf_dist1, pmf_dist3)
#' kullback_leibler_disc(pmf_dist2, pmf_dist3)

kullback_leibler_disc <- function(p, q) {
  
  if (!is.numeric(p)) {
    stop("kullback_leibler_disc(): `p` must be numeric", call. = FALSE)
  }
  if (!is.numeric(q)) {
    stop("kullback_leibler_disc(): `q` must be numeric", call. = FALSE)
  }
  if (any(p < 0) | any(p > 1)) {
    warning("kullback_leibler_disc(): `p` must be between 0 and 1", call. = FALSE)
  }
  if (any(q < 0) | any(q > 1)) {
    warning("kullback_leibler_disc(): `q` must be between 0 and 1", call. = FALSE)
  }
  
  if (length(p) != length(q)) {
    stop("kullback_leibler_disc(): `p` must be the same length as `q`", call. = FALSE)
  }
  
  # Add small pseudo-counts to avoid log(0) and division by zero
  p <- p + 1e-12
  q <- q + 1e-12
  
  sum(p * log(p / q))
}
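# Quick sanity check (illustrative only, not part of the package): the
# divergence of a distribution from itself is (numerically) zero, and the
# measure is asymmetric in `p` and `q`:
#
#   # p <- c(0.2, 0.3, 0.5)
#   # q <- c(0.5, 0.3, 0.2)
#   # kullback_leibler_disc(p, p)  # 0
#   # kullback_leibler_disc(p, q)  # generally differs from kullback_leibler_disc(q, p)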

# ******************************************************************************************************-----
# Gamma-Poisson Specific Functions : ------------------------------------------------------------------------------------------
# ******************************************************************************************************-----


#' Simulate data distributed according to oHMMed with Gamma-Poisson emission densities
#'
#' @param L (integer) length of the simulated data vector (number of observations)
#'
#' @param mat_T (matrix) a square transition probability matrix whose rows sum to 1
#'
#' @param betas (numeric) \code{rate} parameter in \code{\link{rgamma}} for emission probabilities
#'
#' @param alpha (numeric) \code{shape} parameter in \code{\link{rgamma}} for emission probabilities
#'
#' @return
#' Returns a list with the following elements:
#' \itemize{
#'   \item \code{data}: numeric vector with data
#'   \item \code{states}: a factor with the "true" hidden states used to generate the data vector
#'   \item \code{pi}: numeric vector with prior probability of states
#' } 
#'
#' @export
#'
#' @examples
#' mat_T <- rbind(c(1-0.01, 0.01, 0),
#'                c(0.01, 1-0.02, 0.01),
#'                c(0, 0.01, 1-0.01))
#' L <- 2^7
#' betas <- c(0.1, 0.3, 0.5)
#' alpha <- 1
#' 
#' sim_data <- hmm_simulate_gamma_poisson_data(L = L,
#'                                             mat_T = mat_T,
#'                                             betas = betas,
#'                                             alpha = alpha)
#' hist(sim_data$data, 
#'      breaks = 40,
#'      main = "Histogram of Simulated Gamma-Poisson Data", 
#'      xlab = "")
#' sim_data

hmm_simulate_gamma_poisson_data <- function(L, mat_T, betas, alpha) {
  
  if (length(alpha) != 1) {
    stop("hmm_simulate_gamma_poisson_data(): shape parameter `alpha` must be of length 1", call. = FALSE)
  }
  
  if (alpha < 0) {
    stop("hmm_simulate_gamma_poisson_data(): shape parameter `alpha` must be non-negative", call. = FALSE)
  }
  
  if (any(betas < 0)) {
    stop("hmm_simulate_gamma_poisson_data(): rate parameters `betas` must be non-negative", call. = FALSE)
  }
  
  if (!is.matrix(mat_T)) {
    stop("hmm_simulate_gamma_poisson_data(): `mat_T` must be a numeric matrix", call. = FALSE)
  }
  
  if (any(is_row_sum_one_(mat_T) == FALSE)) {
    stop("hmm_simulate_gamma_poisson_data(): rows in the transition matrix `mat_T` must sum up to 1", call. = FALSE)
  }
  
  if (length(L) != 1) {
    stop("hmm_simulate_gamma_poisson_data(): the number of simulations `L` must be a single integer", call. = FALSE)
  }
  
  vstate <- rep(NA, L)
  vemit <- rep(NA, L)
  
  n_states <- nrow(mat_T)
  pi <- get_pi_(mat_T)
  
  vstate[1] <- sample.int(n = n_states, size = 1, replace = TRUE, prob = pi)
  vemit[1] <- stats::rpois(1, stats::rgamma(1, shape = alpha, rate = betas[vstate[1]]))
  
  for(i in 2:L) {
    p_state <- mat_T[vstate[i - 1], ]
    vstate[i] <- sample.int(n = n_states, size = 1, replace = TRUE, prob = p_state)
    vemit[i] <- stats::rpois(1, stats::rgamma(1, shape = alpha, rate = betas[vstate[i]]))
  }
  
  list("data" = vemit,
       "states" = factor(vstate, levels = 1:n_states),
       "pi" = pi)
}
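# Note (illustrative, not part of the package): marginally, a Poisson draw whose
# mean is Gamma(shape = alpha, rate = beta) distributed follows a negative
# binomial distribution with size = alpha and prob = beta / (1 + beta); this is
# the identity used by posterior_prob_gamma_poisson() below via stats::dnbinom().
# A quick empirical check, assuming some illustrative values:
#
#   # alpha <- 1; beta <- 0.3; n <- 1e4
#   # x <- replicate(n, stats::rpois(1, stats::rgamma(1, shape = alpha, rate = beta)))
#   # mean(x)                                                          # ~ alpha / beta
#   # mean(stats::rnbinom(n, size = alpha, prob = beta / (1 + beta)))  # similar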


#' Forward-Backward Algorithm to Calculate the Posterior Probabilities of Hidden States in the Gamma-Poisson Model
#'
#' Runs the scaled forward-backward algorithm and returns the forward and backward variables together with the per-position scaling weights.
#'
#' @param data (numeric) Poisson data
#'
#' @param pi (numeric) prior probability of states
#'
#' @param mat_T (matrix) transition probability matrix
#'
#' @param betas (numeric) vector of rate parameters (one per state)
#'
#' @param alpha (numeric) shape parameter
#'
#' @details
#' Please see \href{https://static-content.springer.com/esm/art\%3A10.1186\%2Fs12859-024-05751-4/MediaObjects/12859_2024_5751_MOESM1_ESM.pdf}{supplementary information} at \doi{10.1186/s12859-024-05751-4} for more details on the algorithm.
#'
#' @return
#' List with the following elements: 
#' \itemize{
#'   \item \code{F}: auxiliary forward variables
#'   \item \code{B}: auxiliary backward variables
#'   \item \code{s}: weights
#' }
#'
#' @export
#'
#' @examples
#' mat_T <- rbind(c(1-0.01,0.01,0),
#'                c(0.01,1-0.02,0.01),
#'                c(0,0.01,1-0.01))
#' L <- 2^10
#' betas <- c(0.1, 0.3, 0.5)
#' alpha <- 1
#'
#' sim_data <- hmm_simulate_gamma_poisson_data(L = L,
#'                                             mat_T = mat_T,
#'                                             betas = betas,
#'                                             alpha = alpha)
#' pi <- sim_data$pi
#' hmm_poisson_data <- sim_data$data
#' hist(hmm_poisson_data, breaks = 100)
#'
#' # Calculate posterior probabilities of hidden states
#' post_prob <- posterior_prob_gamma_poisson(data = hmm_poisson_data,
#'                                           pi = pi,
#'                                           mat_T = mat_T,
#'                                           betas = betas,
#'                                           alpha = alpha)
#' str(post_prob)

posterior_prob_gamma_poisson <- function(data, pi, mat_T, betas, alpha) {
  
  cap_ <- .Machine$double.xmax
  floor_ <- .Machine$double.xmin
  
  L <- length(data)
  n_states <- nrow(mat_T)
  mat_F <- matrix(ncol = n_states, nrow = L)
  vs <- rep(1, L)
  
  pi[pi < floor_] <- floor_
  # Negative binomial success probability implied by the Gamma(alpha, beta) mixing
  nb_prob <- betas / (1 + betas)
  
  vtemp <- cap_floor_(pi * stats::dnbinom(x = data[1], prob = nb_prob, size = alpha), cap_, floor_)
  vs[1] <- sum(vtemp)
  mat_F[1,] <- vtemp / vs[1]
  
  # Forward
  for (l in 2:L) {
    probs <- cap_floor_(stats::dnbinom(x = data[l], prob = nb_prob, size = alpha), cap_, floor_)
    vtemp <- mat_F[l - 1,] %*% mat_T * probs
    vs[l] <- sum(vtemp)
    mat_F[l, ] <- vtemp / vs[l]
  }
  
  # Backward
  mat_B <- matrix(nrow = L, ncol = n_states)
  mat_B[L, ] <- rep(1, n_states) / vs[L]
  
  for (l in L:2) {
    probs <- cap_floor_(stats::dnbinom(x = data[l], prob = nb_prob, size = alpha), cap_, floor_)
    mat_B[l - 1, ] <- mat_T %*% (probs * mat_B[l, ]) / vs[l - 1]
  }
  
  list(F = mat_F, B = mat_B, s = vs)
}
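# Note (illustrative, not part of the package): with the scaling used above, the
# smoothed posterior probability of state i at position l can be recovered as
# F[l, i] * B[l, i] * s[l], so the full matrix of posterior state probabilities
# is obtained row-wise from the returned elements, e.g.
#
#   # post <- posterior_prob_gamma_poisson(data, pi, mat_T, betas, alpha)
#   # gamma_post <- post$F * post$B * post$s   # each row sums to ~1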


#' @keywords internal
sample_states_pois_ <- function(mat_R, mat_T, pi, betas, alpha, data) {
  
  L <- length(mat_R$F[ ,1])
  n_states <- length(mat_R$F[1, ])
  states <- rep(NA, L)
  cap_ <- .Machine$double.xmax
  floor_ <- .Machine$double.xmin
  
  # Posterior probability of the states at position 1: F[1, ] * B[1, ] * s[1]
  p <- cap_floor_(mat_R$F[1, ] * mat_R$B[1, ] * mat_R$s[1], cap_, floor_)
  p <- p / sum(p)
  p[is.na(p)] <- floor_
  all_runif <- stats::runif(L) 
  states[1] <- which.max(cumsum(p) - all_runif[1] > 0) 
  
  
  nb_prob <- betas / (1 + betas)
  # Unnormalised negative binomial emission kernels; the binomial coefficient
  # is state-independent and cancels when `p` is normalised below
  probs <- matrix(NA, nrow = L, ncol = length(betas))
  for (i in 1:L) {
    probs[i, ] <- nb_prob^alpha * (1 - nb_prob)^data[i]
  }
  probs <- cap_floor_(probs, cap_, floor_)
  
  for (l in 2:L) {
    p <- mat_T[states[l - 1], ] * probs[l, ] * mat_R$B[l, ]
    p <- p / sum(p)
    p[is.na(p)] <- floor_
    states[l] <- which.max(cumsum(p) - all_runif[l] > 0) 
  }
  factor(states, levels = 1:n_states)
}
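# Note (illustrative, not part of the package): the pattern
# which.max(cumsum(p) - u > 0) used above is inverse-CDF sampling from a
# discrete distribution: it returns the first index whose cumulative
# probability exceeds the uniform draw u, and is therefore equivalent in
# distribution to
#
#   # sample.int(length(p), size = 1, prob = p)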


#' @keywords internal
sample_betas_alpha_ <- function(cur_betas, cur_alpha, prior_betas, prior_alpha, states = NULL, data = NULL) {
  
  N <- length(data)
  betas <- NULL
  alpha <- NULL
  n_states <- length(cur_betas)
  loglambda_p <- prior_alpha 
  
  # Draw latent per-site Poisson means lambda ~ Gamma(alpha + y, beta + 1),
  # then update each rate parameter beta from its conjugate gamma full conditional
  lvl <- list()
  for (i in 1:n_states) {
    vl <- data[states == i]
    N_i <- length(vl)
    vl <- stats::rgamma(N_i, shape = cur_alpha + vl, rate = cur_betas[i] + 1)
    lvl[[i]] <- vl
    rate_gamma <- 1
    post_shape <- cur_alpha * N_i + 1
    post_rate <- sum(lvl[[i]]) + 1 / prior_betas[i]
    betas[i] <- stats::rgamma(1, shape = post_shape, rate = post_rate)
  }
  
  # Sort rates in decreasing order so that the implied state means alpha / beta are increasing
  betas <- sort(betas, decreasing = TRUE)
  for (i in 1:n_states) {
    if (N > 0) {
      loglambda_p <- loglambda_p + sum(log(betas[i] * lvl[[i]]))
    }
  }
  
  HM <- exp(loglambda_p / (N + 1))
  shape_param <- (N + prior_alpha) * cur_alpha
  
  # Metropolis-Hastings update of the shared shape parameter alpha
  alpha_star <- stats::rgamma(1, shape = shape_param, rate = N)
  lh_star <- 0
  lh_old <- 0
  for (i in 1:n_states) {
    lh_star <- lh_star + (log(HM) * alpha_star - lgamma(alpha_star)) * N
    lh_old <- lh_old + (log(HM) * cur_alpha - lgamma(cur_alpha)) * N
  }
  
  d1 <- stats::dgamma(alpha_star, shape = shape_param, rate = N, log = TRUE)
  d2 <- stats::dgamma(cur_alpha, shape = shape_param, rate = N, log = TRUE)
  lr <- lh_star - d1 - (lh_old - d2)
  alpha <- cur_alpha
  
  if (stats::runif(1) <= exp(lr)) {
    alpha <- alpha_star
  }

  list(betas = betas, alpha = alpha)
}


#' @keywords internal
init_hmm_mcmc_gamma_poisson_ <- function(data, prior_T, prior_betas, prior_alpha,
                                         init_T, init_betas, init_alpha, 
                                         verbose, iter, warmup, thin, chain_id = NULL) {
  
  if (iter < 2) {
    stop("hmm_mcmc_gamma_poisson(): `iter` needs to be bigger than 1", call. = FALSE)
  }
  
  if (!is.integer(data)) {
    stop("hmm_mcmc_gamma_poisson(): `data` needs to be an integer vector", call. = FALSE)
  }
  
  if (sum(is.na(data)) > 0) {
    stop("hmm_mcmc_gamma_poisson(): `data` contains missing values", call. = FALSE)
  }
  
  if (any(is_row_sum_one_(prior_T) == FALSE)) {
    stop("hmm_mcmc_gamma_poisson(): rows in the transition matrix `prior_T` must sum up to 1", call. = FALSE)
  }
  if (length(prior_betas) != nrow(prior_T) | length(prior_betas) != ncol(prior_T)) {
    stop("hmm_mcmc_gamma_poisson(): number of states is not the same between input variables", call. = FALSE)
  }
  
  if (length(prior_alpha) != 1) {
    stop("hmm_mcmc_gamma_poisson(): `prior_alpha` must be of length 1", call. = FALSE)
  }
  if (warmup >= iter) {
    stop("hmm_mcmc_gamma_poisson(): `warmup` must be lower than `iter`", call. = FALSE)
  }
  if (thin > iter - warmup) {
    stop("hmm_mcmc_gamma_poisson(): `thin` cannot exceed iterations after warmup period", call. = FALSE)
  }
  
  # Initialization
  if (is.null(init_betas)) {
    init_betas <- stats::rgamma(length(prior_betas), shape = 1, rate = 1 / prior_betas)
  }
  
  if (is.null(init_alpha)) {
    init_alpha <- stats::rgamma(1, shape = prior_alpha, rate = 1)
  }
  
  if (is.null(init_T)) {
    init_T <- sample_T_(prior_mat = prior_T)
  }
  
  if (length(init_betas) != nrow(init_T) | length(init_betas) != ncol(init_T)) {
    stop("hmm_mcmc_gamma_poisson(): number of states is not the same between input variables", call. = FALSE)
  }
  
  if (length(init_alpha) != 1) {
    stop("hmm_mcmc_gamma_poisson(): `init_alpha` must be of length 1", call. = FALSE)
  }
  
  if (any(is_row_sum_one_(init_T) == FALSE)) {
    stop("hmm_mcmc_gamma_poisson(): rows in the transition matrix `init_T` must sum up to 1", call. = FALSE)
  }
  
  lambda1 <- sum(data) / length(data)
  lambda2 <- numeric(length(init_betas)) 
  for (i in 1:length(init_betas)) { 
    lambda2[i] <- mean(stats::rgamma(1000, init_alpha, init_betas[i]))
  }
  
  chain_char <- NULL
  if (!is.null(chain_id)) {
    chain_char <- paste0("(chain ", chain_id, ") ")
  }
  
  if (verbose) {
    set_alpha <- (mean(data)^2) / ((stats::var(data) - mean(data)) / length(init_betas))
    if ((init_alpha > (set_alpha + 3)) | (init_alpha < (set_alpha - 3))) {
      message("hmm_mcmc_gamma_poisson(): ", chain_char, "initial alpha is not close to the recommended value")
    }
    if (any(lambda2 > stats::var(data))) {
      message("hmm_mcmc_gamma_poisson(): ", chain_char, "at least one initial rate parameter is greater than the overall variance")
    }
  }
  
  init_pi <- get_pi_(init_T)
  
  init_mat_res <- posterior_prob_gamma_poisson(data = data,
                                               pi = init_pi,
                                               mat_T = init_T,
                                               betas = init_betas,
                                               alpha = init_alpha)
  
  init_states <- sample_states_pois_(mat_R = init_mat_res,
                                     pi = init_pi,
                                     mat_T = init_T,
                                     betas = init_betas,
                                     alpha = init_alpha,
                                     data = data)
  
  list(init_states = init_states, init_betas = init_betas,
       init_alpha = init_alpha, init_T = init_T, init_pi = init_pi,
       init_mat_res = init_mat_res)
}
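# Note (illustrative, not part of the package): the "recommended" value of alpha
# referred to in the verbose check above is a simple moment-based quantity;
# assuming count data `y` and `k` hidden states, it could be computed as
#
#   # alpha_mom <- mean(y)^2 / ((stats::var(y) - mean(y)) / k)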


#' MCMC Sampler for a Hidden Markov Model with Gamma-Poisson Emission Densities
#'
#' @param data (integer) vector of count data
#'
#' @param prior_T (matrix) prior transition matrix
#'
#' @param prior_betas (numeric) prior beta parameters
#'
#' @param prior_alpha (numeric) a single prior alpha parameter. By default, \code{prior_alpha=1}
#'
#' @param iter (integer) number of MCMC iterations
#'
#' @param warmup (integer) number of warmup iterations
#'
#' @param thin (integer) thinning parameter. By default, \code{1}
#'
#' @param seed (integer) seed parameter
#'
#' @param init_T (matrix) \code{optional parameter}; initial transition matrix
#'
#' @param init_betas (numeric) \code{optional parameter}; initial beta parameters
#'
#' @param init_alpha (numeric) \code{optional parameter}; initial alpha parameter
#'
#' @param print_params (logical) \code{optional parameter}; print estimated parameters every iteration. By default, \code{TRUE}
#'
#' @param verbose (logical) \code{optional parameter}; print additional messages. By default, \code{TRUE}
#'
#' @details
#' Please see \href{https://static-content.springer.com/esm/art\%3A10.1186\%2Fs12859-024-05751-4/MediaObjects/12859_2024_5751_MOESM1_ESM.pdf}{supplementary information} at \doi{10.1186/s12859-024-05751-4} for more details on the algorithm.
#' 
#' For usage recommendations please see \url{https://github.com/LynetteCaitlin/oHMMed/blob/main/UsageRecommendations.pdf}.
#' 
#' @references
#' Claus Vogl, Mariia Karapetiants, Burçin Yıldırım, Hrönn Kjartansdóttir, Carolin Kosiol, Juraj Bergman, Michal Majka, Lynette Caitlin Mikula.
#' Inference of genomic landscapes using ordered Hidden Markov Models with emission densities (oHMMed).
#' BMC Bioinformatics 25, 151 (2024). \doi{10.1186/s12859-024-05751-4}
#'
#' @return
#' List with the following elements:
#' \itemize{
#'   \item \code{data}: the input data used for inference
#'   \item \code{samples}: list with samples
#'   \item \code{estimates}: list with various estimates
#'   \item \code{idx}: indices with iterations after the warmup period
#'   \item \code{priors}: prior parameters
#'   \item \code{inits}: initial parameters
#'   \item \code{last_iter}: list with samples from the last MCMC iteration
#'   \item \code{info}: list with various meta information about the object
#' }
#'
#' @export
#'
#' @examples
#' # Simulate Poisson-Gamma data
#' N <- 2^10
#' true_T <- rbind(c(0.95, 0.05, 0),
#'                 c(0.025, 0.95, 0.025),
#'                 c(0.0, 0.05, 0.95))
#'
#' true_betas <- c(2, 1, 0.1)
#' true_alpha <- 1
#'
#' simdata_full <- hmm_simulate_gamma_poisson_data(L = N,
#'                                                 mat_T = true_T,
#'                                                 betas = true_betas,
#'                                                 alpha = true_alpha)
#' simdata <- simdata_full$data
#' hist(simdata, breaks = 40, probability = TRUE,  
#'      main = "Distribution of the simulated Poisson-Gamma data")
#' lines(density(simdata), col = "red")
#' 
#' # Set numbers of states to be inferred
#' n_states_inferred <- 3
#' 
#' # Set priors
#' prior_T <- generate_random_T(n_states_inferred)
#' prior_betas <- c(1, 0.5, 0.1)
#' prior_alpha <- 3
#' 
#' # Simulation settings
#' iter <- 50
#' warmup <- floor(iter / 5) # 20 percent
#' thin <- 1
#' seed <- sample.int(10000, 1)
#' print_params <- FALSE # if TRUE then parameters are printed in each iteration
#' verbose <- FALSE # if TRUE then the state of the simulation is printed
#'
#' # Run MCMC sampler
#' res <- hmm_mcmc_gamma_poisson(data = simdata,
#'                               prior_T = prior_T,
#'                               prior_betas = prior_betas,
#'                               prior_alpha = prior_alpha,
#'                               iter = iter,
#'                               warmup = warmup,  
#'                               thin = thin,
#'                               seed = seed,
#'                               print_params = print_params,
#'                               verbose = verbose)
#' res
#'
#' summary(res) # summary output can also be assigned to a variable
#' 
#' coef(res) # extract model estimates
#' 
#' # plot(res) # MCMC diagnostics

hmm_mcmc_gamma_poisson <- function(data,
                                   prior_T,
                                   prior_betas,
                                   prior_alpha = 1,
                                   iter = 5000,
                                   warmup = floor(iter / 1.5),
                                   thin = 1,
                                   seed = sample.int(.Machine$integer.max, 1),
                                   init_T = NULL,
                                   init_betas = NULL,
                                   init_alpha = NULL,
                                   print_params = TRUE,
                                   verbose = TRUE) {
  
  set.seed(seed)
  
  init_data <- init_hmm_mcmc_gamma_poisson_(data, prior_T, prior_betas, prior_alpha,
                                            init_T, init_betas, init_alpha,
                                            verbose, iter, warmup, thin)
  
  n_data <- length(data)
  n_states <- length(prior_betas)
  
  mean_states <- matrix(0, ncol = n_states, nrow = n_data)
  all_betas <- matrix(0, ncol = n_states, nrow = iter)
  all_alpha <- numeric(iter)
  all_mat_T <- array(dim = c(dim(prior_T), iter))
  vlh <- numeric(iter)
  
  all_betas[1, ] <- init_data$init_betas
  all_alpha[1] <- init_data$init_alpha
  all_mat_T[, ,1] <- init_data$init_T
  vlh[1] <- sum(log(init_data$init_mat_res$s))
  states <- init_data$init_states
  betas <- init_data$init_betas
  alpha <- init_data$init_alpha
  
  # Run sampler
  for (it in 2:iter) {
    
    print_progress_(it, iter, verbose = verbose)
    m <- sample_betas_alpha_(cur_betas = betas,
                             cur_alpha = alpha,
                             prior_betas = prior_betas,
                             prior_alpha = prior_alpha,
                             states = states,
                             data = data)
    
    betas <- m$betas
    alpha <- m$alpha
    mat_T <- sample_T_(states, prior_mat = prior_T)
    pi <- get_pi_(mat_T)
    mat_res <- posterior_prob_gamma_poisson(data = data,
                                            pi = pi,
                                            mat_T = mat_T,
                                            betas = betas,
                                            alpha = alpha)
    
    states <- sample_states_pois_(mat_R = mat_res,
                                  pi = pi,
                                  mat_T = mat_T,
                                  betas = betas,
                                  alpha = alpha,
                                  data = data)
    if (it > warmup) {
      for (i in 1:n_states) {
        mean_states[ ,i] <- mean_states[ ,i] + (states == i)
      }
    }
    
    all_betas[it, ] <- betas
    all_alpha[it] <- alpha
    all_mat_T[ , ,it] <- mat_T
    vlh[it] <- sum(log(mat_res$s))
    
    if (print_params) {
      print(c(colMeans(all_betas[1:it, ]), mean(all_alpha[1:it])))
    }
  }
  
  # Prepare outputs
  idx <- seq.int(warmup + 1, by = thin, to = iter)
  
  colnames(all_betas) <- paste0("beta[", 1:n_states,"]")
  all_means <- all_alpha / all_betas
  colnames(all_means) <- paste0("means[", 1:n_states,"]")
  
  samples <- list(betas = all_betas,
                  alpha = all_alpha,
                  means = all_means,
                  mat_T = all_mat_T,
                  vlh = vlh)
  
  posterior_states <- factor(max.col(mean_states, "first"), 1:n_states)
  
  estimates <- list(betas = colMeans(all_betas[idx, ]),
                    alpha = mean(all_alpha[idx]),
                    means = colMeans(all_means[idx, ]),
                    mat_T = apply(all_mat_T[ , ,idx], c(1,2), mean),
                    posterior_states = posterior_states,
                    posterior_states_prob = mean_states / (iter - warmup),
                    log_likelihood = vlh[idx])    
  
  priors <- list(prior_betas = prior_betas,
                 prior_alpha = prior_alpha,
                 prior_T = prior_T)
  
  inits <- list(init_states = init_data$init_states,
                init_betas = init_data$init_betas,
                init_alpha = init_data$init_alpha,
                init_T = init_data$init_T)
  
  last_iter <- list(betas = all_betas[iter, ],
                    alpha = all_alpha[iter],
                    means = all_means[iter, ],
                    mat_T = all_mat_T[ , ,iter])
  
  info <- list(model_name = "hmm_mcmc_gamma_poisson",
               date = as.character(Sys.time()),
               seed = seed,
               iter = iter,
               warmup = warmup,
               thin = thin,
               n_states = length(prior_betas))
  
  res <- list(data = data,
              samples = samples,
              estimates = estimates,
              idx = idx,
              priors = priors,
              inits = inits,
              last_iter = last_iter,
              info = info)
  
  class(res) <- "hmm_mcmc_gamma_poisson"
  res$info$object_size <- format(utils::object.size(res), "Mb")
  res
}
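# Note (illustrative, not part of the package): assuming a fitted object `res`
# as in the examples above, the decoded states and posterior state probabilities
# can be accessed directly from the returned list, e.g.
#
#   # res$estimates$posterior_states        # factor of most probable states per position
#   # res$estimates$posterior_states_prob   # matrix of posterior state probabilities
#   # res$samples$betas[res$idx, ]          # post-warmup (thinned) draws of the betas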


#' @keywords internal
#' @export

print.hmm_mcmc_gamma_poisson <- function(x, ...) {
  info <- x$info
  cat("Model:", "HMM Gamma-Poisson", "\n")
  cat("Type:", "MCMC", "\n")
  cat("Iter:", info$iter, "\n")
  cat("Warmup:", info$warmup, "\n")
  cat("Thin:", info$thin, "\n")
  cat("States:", length(x$priors$prior_betas), "\n")
}


#' @keywords internal
#' @export

summary.hmm_mcmc_gamma_poisson <- function(object, ...) {
  
  info <- object$info
  idx <- object$idx
  data <- object$data
  
  beta_est <- object$estimates$betas
  alpha_est <- object$estimates$alpha
  T_est <- object$estimates$mat_T
  
  means_est <- object$estimates$means
  
  post_states <- object$estimates$posterior_states
  state_tab <- table(post_states, dnn = "")
  
  kl_list <- rep(NA, 500)
  for (j in 1:500) {
    sim_output <- unlist(lapply(1:length(state_tab), function(i) {
      stats::rnbinom(state_tab[i],
                     size = alpha_est,
                     prob = beta_est[i] / (1 + beta_est[i]))
    }))
    jth_levels <- 0:max(c(data, sim_output))
    dens_data_table <- table(factor(data, levels = jth_levels))
    dens_sim_table <- table(factor(sim_output, levels = jth_levels))
    
    dens_data <- dens_data_table / sum(dens_data_table)
    dens_sim <- dens_sim_table / sum(dens_sim_table)
    
    kl_list[j] <- kullback_leibler_disc(dens_data, dens_sim)
  }
  
  kl_div <- mean(kl_list)
  ll_info <- c(mean(object$estimates$log_likelihood),
               stats::sd(object$estimates$log_likelihood),
               stats::median(object$estimates$log_likelihood))
  names(ll_info) <- c("mean", "sd", "median")
  
  group_comparison <- rep(NA, length(beta_est) - 1)
  for (k in 1:(length(beta_est) - 1)) {
    gr1 <- object$data[object$estimates$posterior_states == k]
    gr2 <- object$data[object$estimates$posterior_states == (k + 1)]
    names(group_comparison)[k] <- paste0(k, "-", k + 1)
    if (length(gr1) == 0 | length(gr2) == 0) {
      next
    } 
    group_comparison[k] <- stats::poisson.test(x = c(sum(gr1), sum(gr2)), 
                                               T = c(length(gr1), length(gr2)), 
                                               alternative = "l")$p.value
  }
  
  summary_res <- list("estimated_betas" = beta_est,
                      "estimated_alpha" = alpha_est,
                      "estimated_means" = means_est,
                      "estimated_transition_rates" = T_est,
                      "assigned_states" = state_tab,
                      "approximate_kullback_leibler_divergence" = kl_div,
                      "log_likelihood" = ll_info,
                      "state_differences_significance" = group_comparison)
  
  cat("Estimated betas:\n")
  print(summary_res$estimated_betas)
  cat("\n")
  
  if (stats::sd(object$samples$alpha) > 0) {
    cat("Estimated alpha:\n")
    cat(summary_res$estimated_alpha)
    cat("\n")
    cat("\n")
  }
  
  cat("Estimated means:\n")
  cat(summary_res$estimated_means)
  cat("\n")
  cat("\n")
  
  cat("Estimated transition rates:\n")
  etr <- summary_res$estimated_transition_rates
  rownames(etr) <- colnames(etr) <- names(summary_res$assigned_states)
  print(etr)
  cat("\n")
  
  cat("Number of windows assigned to hidden states:\n")
  as <- summary_res$assigned_states
  as_names <- attributes(as)$dimnames[[1]]
  print(stats::setNames(as.numeric(as), as_names))
  cat("\n")
  
  cat("Kullback-Leibler divergence between observed and estimated distributions:\n")
  cat(stats::setNames(summary_res$approximate_kullback_leibler_divergence, ""))
  cat("\n")
  cat("\n")
  
  cat("Log Likelihood:\n")
  print(summary_res$log_likelihood)
  cat("\n")
  
  cat("P-value of poisson test for difference between rates of states (stepwise):\n")
  print(summary_res$state_differences_significance)
  cat("\n")
  
  invisible(summary_res)
}
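# Note (illustrative, not part of the package): summary() returns its components
# invisibly, so they can be captured for programmatic use, e.g.
#
#   # s <- summary(res)
#   # s$approximate_kullback_leibler_divergence
#   # s$state_differences_significance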


#' @rdname coef.hmm_mcmc_normal
#'
#' @export
#' @export coef.hmm_mcmc_gamma_poisson
#'
#' @examples
#' coef(example_hmm_mcmc_gamma_poisson)

coef.hmm_mcmc_gamma_poisson <- function(object, ...) {
  est <- object$estimates
  list(betas = est$betas,
       alpha = est$alpha,
       means = est$means,
       mat_T = est$mat_T)
}


#' Plot Diagnostics for \code{hmm_mcmc_gamma_poisson} Objects
#' 
#' This function creates a variety of diagnostic plots that can be useful when 
#' conducting Markov Chain Monte Carlo (MCMC) simulation of a Gamma-Poisson hidden Markov model (HMM).
#' These plots help to assess the convergence, fit, and performance of the MCMC simulation.
#'
#' @param x (hmm_mcmc_gamma_poisson) HMM MCMC gamma-poisson object
#'
#' @param simulation (logical) set to \code{TRUE} if the input data was simulated, so that the true parameter values can be supplied for comparison; by default, \code{simulation=FALSE}, i.e. the input data is treated as empirical
#'
#' @param true_betas (numeric) true betas. To be used if \code{simulation=TRUE}
#'
#' @param true_alpha (numeric) true alpha. To be used if \code{simulation=TRUE}
#'
#' @param true_mat_T (matrix) \code{optional parameter}; true transition matrix. To be used if \code{simulation=TRUE}
#'
#' @param true_states (integer) \code{optional parameter}; true states. To be used if \code{simulation=TRUE}
#' 
#' @param show_titles (logical) if \code{TRUE} then titles are shown for all graphs. By default, \code{TRUE}
#' 
#' @param log_statesplot (logical) if \code{TRUE} then log-statesplots are shown. By default, \code{FALSE}
#'
#' @param ... not used
#'
#' @return
#' Several diagnostic plots that can be used to evaluate the MCMC simulation
#' of the gamma-poisson HMM
#'
#' @export
#' @export plot.hmm_mcmc_gamma_poisson
#'
#' @examples
#' \donttest{
#' plot(example_hmm_mcmc_gamma_poisson)
#' }

plot.hmm_mcmc_gamma_poisson <- function(x,
                                        simulation = FALSE,
                                        true_betas = NULL,
                                        true_alpha = NULL,
                                        true_mat_T = NULL,
                                        true_states = NULL,
                                        show_titles = TRUE,
                                        log_statesplot = FALSE,
                                        ...) {
  
  info <- x$info
  data <- x$data
  idx <- x$idx
  
  if (simulation) {
    cond <- any(c(is.null(true_betas), is.null(true_alpha), is.null(true_mat_T),
                  is.null(true_states)))
    if (cond) {
      stop("plot.hmm_mcmc_gamma_poisson(): if `simulation=TRUE` then `true_betas` ",
           "`true_alpha`, `true_mat_T` and `true_states` must be defined", call. = FALSE)
    }
  }
  
  # Diagnostics betas
  all_betas <- convert_to_ggmcmc(x, pattern = "beta")
  n_betas <- attributes(all_betas)$nParameters
  facet_means <- ggplot2::facet_wrap(~ Parameter, ncol = floor(n_betas / 2), scales = "free")
  
  mdens <- ggmcmc::ggs_density(all_betas) + 
    facet_means + 
    ggplot2::labs(x = "Value", 
                  y = "Density", 
                  title = if (show_titles) "Densities of Betas" else NULL)
  
  mtrace <- ggmcmc::ggs_traceplot(all_betas) + 
    facet_means + 
    ggplot2::labs(x = "Iteration", 
                  y = "Value", 
                  title = if (show_titles) "Traceplots of Betas" else NULL)
  
  # Diagnostics transition rates
  all_T <- convert_to_ggmcmc(x, pattern = "T")
  n_t <- attributes(all_T)$nParameters
  facet_t <- ggplot2::facet_wrap(~ Parameter, ncol = sqrt(n_t), scales = "free")
  labels_t <- ggplot2::scale_y_continuous(labels = scales::number_format(accuracy = 0.01, decimal.mark = "."))
  
  Ttrace <- ggmcmc::ggs_traceplot(all_T) + 
    facet_t + 
    labels_t + 
    ggplot2::labs(x = "Iteration", 
                  y = "Value",
                  title = if (show_titles) "Traceplots of Parameters in Transition Matrix" else NULL)
  
  Tdens <- ggmcmc::ggs_density(all_T) + 
    facet_t + 
    labels_t + 
    ggplot2::labs(x = "Value", 
                  y = "Density",
                  title = if (show_titles) "Densities of Parameters in Transition Matrix" else NULL)
  
  # Diagnostics alpha
  df_alpha <- convert_to_ggmcmc(x, "alpha")
  sdtrace <- ggmcmc::ggs_traceplot(df_alpha) + 
    ggplot2::labs(x = "Iteration", 
                  y = "Value", 
                  title = if (show_titles) "Traceplot of Alpha" else NULL)
  
  sddens <- ggmcmc::ggs_density(df_alpha) +
    ggplot2::labs(x = "Value", 
                  y = "Density",
                  title = if (show_titles) "Density of Alpha" else NULL)
  
  # Diagnostics mean
  all_means <- convert_to_ggmcmc(x, pattern = "means")
  n_means <- attributes(all_means)$nParameters
  facet_me <- ggplot2::facet_wrap(~ Parameter, ncol = floor(n_means / 2), scales = "free")
  
  medens <- ggmcmc::ggs_density(all_means) + 
    facet_me + 
    ggplot2::labs(x = "Value", 
                  y = "Density",
                  title = if (show_titles) "Densities of Means" else NULL)
  
  metrace <- ggmcmc::ggs_traceplot(all_means) + 
    facet_me + 
    ggplot2::labs(x = "Iteration", 
                  y = "Value",
                  title = if (show_titles) "Traceplots of Means" else NULL)
  
  # Likelihood trace
  lltrace <- x$estimates$log_likelihood
  lltrace_df <- as.data.frame(cbind(idx, lltrace))  # idx = post-warmup (thinned) iteration numbers
  names(lltrace_df) <- c("iteration", "log_likelihood")
  
  llplot <- ggplot2::ggplot(lltrace_df, ggplot2::aes_string(x = "iteration", 
                                                            y = "log_likelihood")) +
    ggplot2::geom_line() +
    ggplot2::labs(x = "Iteration", 
                  y = "Log-likelihood",
                  title = if (show_titles) "Traceplot of Log-Likelihood" else NULL)
  
  # Confusion matrix
  if (simulation) {
    m_multi <- list("target" = true_states,
                    "prediction" = x$estimates$posterior_states)
    conf_mat <- cvms::confusion_matrix(targets = m_multi$target,
                                       predictions = m_multi$prediction)
    
    conf_mat_plot <- suppressWarnings(
      cvms::plot_confusion_matrix(conf_mat$`Confusion Matrix`[[1]],
                                  add_sums = TRUE) +
        ggplot2::labs(title = if (show_titles) "Confusion Matrix" else NULL)
    )
  }
  
  # Check assignment of states along chromosome
  n_data <- length(data)
  post_means <- numeric(n_data)
  if (!log_statesplot) {
    states_df <- as.data.frame(cbind(1:n_data, data, x$estimates$posterior_states))
    for (l in 1:length(data)) {
      post_means[l] <- sum(x$estimates$alpha / x$estimates$betas * x$estimates$posterior_states_prob[l, ])
    }
  } else {
    states_df <- as.data.frame(cbind(1:n_data, log(data + 1), x$estimates$posterior_states))
    for (l in 1:n_data) {
      post_means[l] <- log((sum(x$estimates$alpha / x$estimates$betas * x$estimates$posterior_states_prob[l, ])) + 1)
    }
  }
 
  states_df$post_means <- post_means
  names(states_df) <- c("position", "data", "posterior_states", "posterior_means")
  states_df$posterior_states <- as.factor(states_df$posterior_states)
  
  statesplot <- ggplot2::ggplot(states_df, ggplot2::aes_string(x = "position", y = "data")) +
    ggplot2::geom_line(col = "grey") +
    ggplot2::geom_point(ggplot2::aes_string(colour = "posterior_states"), shape = 20, size = 1.5, alpha = 0.75) +
    ggplot2::geom_line(ggplot2::aes_string(x = "position", y = "posterior_means"), size = 0.15) +
    ggplot2::guides(colour = ggplot2::guide_legend(title = "Post States")) +
    ggplot2::labs(x = "Position", 
                  y = "Data",
                  title = if (show_titles) "States Plot" else NULL)
  
  if (simulation) {
    if (!log_statesplot) {
      states_df2 <- as.data.frame(cbind(1:n_data, data, true_states))
      post_means <- (true_alpha / true_betas)[true_states]
      states_df2$post_means <- post_means
    } else {
      states_df2 <- as.data.frame(cbind(1:n_data, log(data + 1), true_states))
      post_means <- (true_alpha / true_betas)[true_states]
      states_df2$post_means <- log(post_means + 1)
    }
    names(states_df2) <- c("position", "data", "true_states", "true_means")
    states_df2$true_states <- as.factor(states_df2$true_states)
    
    statesplot2 <- ggplot2::ggplot(states_df2, ggplot2::aes_string(x = "position", y = "data")) +
      ggplot2::geom_line(col = "grey") +
      ggplot2::geom_point(ggplot2::aes_string(colour = "true_states"), shape = 20, size = 1.5, alpha = 0.75) +
      ggplot2::geom_line(ggplot2::aes_string(x = "position", y = "true_means"), size = 0.15) +
      ggplot2::guides(colour = ggplot2::guide_legend(title = "True States")) +
      ggplot2::labs(x = "Position", y = "Data")
  }
  
  # Check fit
  post_states <- x$estimates$posterior_states
  state_tab <- table(post_states)
  beta_est <- x$estimates$betas
  alpha_est <- x$estimates$alpha
  
  sim_output1 <- NA   
  for (j in 1:500) {
    sim_output <- unlist(lapply(1:length(state_tab), function(i) {
      stats::rnbinom(state_tab[i],
                     size = alpha_est,
                     prob = beta_est[i] / (1 + beta_est[i]))
    }))
    sim_output1 <- c(sim_output1, sim_output)
  }
  sim_output1 <- sim_output1[2:length(sim_output1)]
  all_sim_levels <- 0:max(c(data, sim_output1))
  dens_data_tab <- table(factor(data, levels = all_sim_levels))
  dens_sim_tab <- table(factor(sim_output1, levels = all_sim_levels))
  dens_data1 <- dens_data_tab / sum(dens_data_tab)
  dens_sim1 <- dens_sim_tab / sum(dens_sim_tab)
  
  kl_plot_f <- function() {
    vcd::rootogram(as.numeric(dens_data1), 
                   as.numeric(dens_sim1),
                   lines_gp = grid::gpar(col = "red", lwd = 1), 
                   main = if (show_titles) "Model Fit" else NULL,
                   points_gp = grid::gpar(col = "black"), 
                   pch = "",
                   ylab = "Frequency (sqrt)",
                   xlab = "Number of Occurrences")
  }
  
  dens_df <- as.data.frame(cbind(rep("observed", length(data)), data))
  
  names(dens_df) <- c("data_type", "value")
  dens_df$value <- as.numeric(dens_df$value)
  
  klplot <- ggplot2::ggplot(dens_df, ggplot2::aes_string(x = "value")) +
    ggplot2::geom_histogram(bins = floor(dim(table(factor(data, levels = 0:max(data))))), 
                            fill = "grey", position = "identity") +
    ggplot2::scale_color_manual(values = c("black")) +
    ggplot2::geom_vline(xintercept = x$estimates$means, color = "black", size = 0.2) +
    ggplot2::labs(x = "Number of Occurences", 
                  y = "Frequency",
                  title = if (show_titles) "Observed Counts and Inferred Means" else NULL) 
  
  if (simulation) {
    klplot <- ggplot2::ggplot(dens_df, ggplot2::aes_string(x = "value")) +
      ggplot2::geom_histogram(bins = floor(dim(table(factor(data, levels = 0:max(data))))), 
                              fill = "grey", position = "identity") +
      ggplot2::scale_color_manual(values = c("black")) +
      ggplot2::geom_vline(xintercept = x$estimates$means, color = "black", size = 0.6) +
      ggplot2::geom_vline(xintercept = true_alpha / true_betas, color = "blue", 
                          size = 0.4, linetype = "dotted") + 
      ggplot2::labs(x = "Number of Occurences", 
                    y = "Frequency",
                    title = if (show_titles) "Observed Counts and Inferred (and True) Means" else NULL)
    
  }
  
  if (simulation) {
    plotlist <- list(mtrace, mdens, Ttrace, Tdens, sdtrace, sddens, metrace, 
                     medens, llplot, conf_mat_plot, klplot)
  } 
  
  if (isFALSE(simulation)) {
    plotlist <- list(mtrace, mdens, Ttrace, Tdens, sdtrace, sddens, metrace, 
                     medens, llplot, statesplot, klplot)
  } 
  
  for (ii in 1:length(plotlist)) {
    
    if (simulation && ii == (length(plotlist) - 2)) {
      gridExtra::grid.arrange(statesplot, statesplot2, nrow = 2)
    }
    invisible(utils::capture.output(print(plotlist[[ii]])))
  }
  kl_plot_f()   
}
