hgzips: hgzips

Documented in grid_HZINB HZINB_one_gamma parRangeCheck

#' HGZIPS - HZINB (not assuming independence)
#'
#' This \code{HZINB_one_gamma} function finds hyperparameter estimates by implementing the Expectation-Maximization (EM) algorithm and hierarchical zero-inflated negative binomial model with one gamma component.
#'
#' @name HZINB_one_gamma
#' @import emdbook
#' @import stats
#' @import pscl
#' @param grid_a alpha value grid
#' @param grid_b beta value grid
#' @param grid_omega omega value grid
#' @param init_pi_klh initial probability value of all the alpha, beta, omega combinations for implementing the EM algprithm
#' @param dataset a list of squashed/unsquashed datasets that include N_ij, E_ij and weights for each drug (j). This dataset list can be generated by the rawProcessing function in this package.
#' @param iteration number of EM algorithm iterations to run
#' @param N_ij matrix of N_ij values, i - AE, j - drugs
#' @param E_ij matrix of E_ij values, corresponding to N_ij
#' @param Loglik whether to return the loglikelihood of each iteration or not (TRUE or FALSE)
#' @param a_j alpha for gamma distribution
#' @param b_j beta for gamma distribution
#' @param omega_j proportion
#' @param K number of a_j in grid
#' @param L number of b_j in grid
#' @param H number of omega_j in grid
#' @param zeroes A logical scalar specifying if zero counts should be included.
#' @param N_star the minimum Nij count size to be used for hyperparameter estimation. If zeroes are included in Nij vector, please set N_star = NULL
#'
#'
#'
##########################################################
## HZINB: check the range of parameters
##########################################################

# input N = N_ij, E = E_ij

#' @rdname HZINB_one_gamma
#' @return \code{parRangeCheck} the estimated a_j, b_j and omega_j for each drug (j)
#' parRangeCheck
#' @export
parRangeCheck = function(N_ij, E_ij){

#  if (!require('countreg')) install.packages('countreg'); library('countreg')

  a_j = rep(NA, ncol(N_ij))
  b_j = rep(NA, ncol(N_ij))
  omega_j = rep(NA, ncol(N_ij))

  for (j in 1:ncol(N_ij)){
    data = as.data.frame(cbind(N_ij[,j], rep(1, nrow(N_ij))))
    colnames(data) = c("N_ij", "X")
    tryCatch({
      model = pscl::zeroinfl(N_ij ~ 1, data = data, dist = "negbin", offset = log(E_ij[,j]))
      a_j[j] = model$theta
      omega_j[j] = exp(coef(model)[2])/(1 + exp(coef(model)[2]))
      b_j[j] = a_j[j]/exp(model$coefficients[1]$count)

    }, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
  }

  result = list("a_j" = a_j, "b_j" = b_j, "omega_j" = omega_j)
  return(result)

}



##########################################################
## HZINB: build a parameter grid
##########################################################


#' @rdname HZINB_one_gamma
#' @return \code{grid_HZINB} build a suitable grid of a_j, b_j, and omega_j for implementing HZINB
#' grid_HZINB
#' @export
#'
grid_HZINB = function(a_j, b_j, omega_j, K, L, H){

  grid = as.data.frame(matrix(NA, max(c(K, L, H)), 3))
  colnames(grid) = c("a_j", "b_j", "omega_j")

  for (i in c(1:H)){
    grid[i,3] = exp(log(omega_j[which.min(omega_j)]) + (i)/(K + 1)*(log(omega_j[which.max(omega_j)])  - log(omega_j[which.min(omega_j)])))
  }

  for (i in c(1:K)){
    grid[i,1] = exp(log(a_j[which.min(a_j)]) + (i - 1)/(K + 1)*(log(quantile(a_j, 0.99, names = FALSE, na.rm = TRUE)) - log(a_j[which.min(a_j)])))
  }

  for (i in c(1:L)){
    grid[i,2] = exp(log(b_j[which.min(b_j)]) + (i - 1)/(K + 1)*(log(quantile(b_j, 0.99, names = FALSE, na.rm = TRUE)) - log(b_j[which.min(b_j)])))
  }

  return(grid)

}


#####################################
## Hierarchical ZINB (one gamma)
#####################################
# +-x +-x +-x +-x +-x +-x +-x +-x
#  Not assuming independence
# +-x +-x +-x +-x +-x +-x +-x +-x

#' @rdname HZINB_one_gamma
#' @return \code{HZINB_one_gamma} a list of estimated probability of each alpha, beta, omega combination and their corresponding loglikelihood (optional)
#' \itemize{
#' \item \code{theta_EM} Estimate of hyperparameters for each EM iteration
#' \item \code{llh} logliklihood for each EM iteration (optional)
#' }
#' HZINB_one_gamma
#' @export
HZINB_one_gamma = function(grid_a, grid_b, grid_omega, init_pi_klh, dataset, iteration, Loglik = FALSE, zeroes = FALSE, N_star = 1){

  for (k in 1:length(dataset)){
    if (!is.null(N_star)){
      dataset[[k]] = subset(dataset[[k]], N >= N_star)
    }
  }

  if (zeroes == FALSE){
    K = length(grid_a)
    L = length(grid_b)

    #  if (!require('countreg')) install.packages('countreg'); library('countreg')

    all_combinations = as.data.frame(matrix(NA, K*L, 2))
    colnames(all_combinations) = c("a_j", "b_j")
    all_combinations$a_j = rep(grid_a, 10)

    for (i in c(1:L)){
      all_combinations$b_j[((i - 1)*10 + 1):(i*10)] = rep(grid_b[i], 10)
    }

    ## EM algorithm

    # initialization
    N.EM <- iteration  # number of E-M iterations
    #iteration_50 = pi_klh[50,]

    pi_klh = matrix(NA, N.EM + 1, K*L)
    pi_klh[1,] = init_pi_klh

    denominator = rep(NA, length(dataset))
    numerator = rep(NA, length(dataset))
    #ratio = rep(NA, ncol(N_ij))
    joint_probs = as.data.frame(matrix(NA, nrow(all_combinations), length(dataset)))


    for (j in 1:length(dataset)){
      for (m in 1:nrow(all_combinations)){
        joint_probs[m,j] = sum(dataset[[j]]$weight * (dnbinom(dataset[[j]]$N, size = all_combinations$a_j[m], prob = all_combinations$b_j[m]/(dataset[[j]]$E + all_combinations$b_j[m]), log = TRUE) - log1p(-pnbinom(N_star - 1, size = all_combinations$a_j[m], prob = all_combinations$b_j[m]/(dataset[[j]]$E + all_combinations$b_j[m]), log = TRUE))))
      }
    }

    LSE_R <- function(vec){
      n.vec <- length(vec)
      vec <- sort(vec, decreasing = TRUE)
      Lk <- vec[1]
      for (k in 1:(n.vec-1)) {
        Lk <- max(vec[k+1], Lk) + log1p(exp(-abs(vec[k+1] - Lk)))
      }
      return(Lk)
    }

    ratio = as.data.frame(matrix(NA, K*L, length(dataset)))

    for (i in c(1 : N.EM)) {
      print(i)
      for (m in 1:nrow(all_combinations)){
        for (j in 1:length(dataset)){
          ratio[m,j] = log(pi_klh[i, m]) + joint_probs[m,j] - LSE_R((log(pi_klh[i,]) + joint_probs[,j])[which((log(pi_klh[i,]) + joint_probs[,j]) != "NaN")])
        }
      }

      all = cbind(all_combinations, ratio)
      all$sum = rowSums(exp(ratio))
      overallSum = sum(all$sum, na.rm = TRUE)

      pi_klh[i + 1, ] = (all$sum)/overallSum
    }

    result = list("pi_klh" = pi_klh, "abomega_combinations" = all_combinations)

  } else {

    K = length(grid_a)
    L = length(grid_b)
    H = length(grid_omega)

    #  if (!require('countreg')) install.packages('countreg'); library('countreg')

    all_combinations = as.data.frame(matrix(NA, K*L*H, 3))
    colnames(all_combinations) = c("a_j", "b_j", "omega_j")
    all_combinations$a_j = rep(grid_a, 100)

    for (i in c(1:L)){
      all_combinations$b_j[((i - 1)*100 + 1):(i*100)] = rep(grid_b[i], 100)
    }

    for (i in c(1:H)){
      row_num = c(((i - 1)*10 + 1):(i*10))
      all_combinations$omega_j[row_num] = rep(grid_omega[i], 10)
    }

    all_combinations$omega_j = rep(all_combinations$omega_j[1:100], 10)

    ## EM algorithm

    # initialization
    N.EM <- iteration  # number of E-M iterations
    #iteration_50 = pi_klh[50,]

    pi_klh = matrix(NA, N.EM + 1, K*L*H)
    pi_klh[1,] = init_pi_klh

    denominator = rep(NA, length(dataset))
    numerator = rep(NA, length(dataset))
    #ratio = rep(NA, ncol(N_ij))
    joint_probs = as.data.frame(matrix(NA, nrow(all_combinations), length(dataset)))

    for (j in 1:length(dataset)){
      for (m in 1:nrow(all_combinations)){
        joint_probs[m,j] = sum(dataset[[j]]$weight * emdbook::dzinbinom(dataset[[j]]$N, mu = (dataset[[j]]$E/all_combinations$b_j[m])*all_combinations$a_j[m], size = all_combinations$a_j[m], zprob = all_combinations$omega_j[m], log = TRUE))
      }
    }

    LSE_R <- function(vec){
      n.vec <- length(vec)
      vec <- sort(vec, decreasing = TRUE)
      Lk <- vec[1]
      for (k in 1:(n.vec-1)) {
        Lk <- max(vec[k+1], Lk) + log1p(exp(-abs(vec[k+1] - Lk)))
      }
      return(Lk)
    }

    ratio = as.data.frame(matrix(NA, K*H*L, length(dataset)))

    for (i in c(1 : N.EM)) {
      print(i)
      for (m in 1:nrow(all_combinations)){
        for (j in 1:length(dataset)){
          ratio[m,j] = log(pi_klh[i, m]) + joint_probs[m,j] - LSE_R((log(pi_klh[i,]) + joint_probs[,j])[which((log(pi_klh[i,]) + joint_probs[,j]) != "NaN")])
        }
      }

      all = cbind(all_combinations, ratio)
      all$sum = rowSums(exp(ratio))
      overallSum = sum(all$sum, na.rm = TRUE)

      pi_klh[i + 1, ] = (all$sum)/overallSum
    }

    if (Loglik == TRUE){

      llh_j = rep(NA, length(dataset))
      llh = rep(NA, N.EM)

      for (i in 1:N.EM){
        for (j in 1:length(dataset)){
          llh_j[j] = unlist(LSE_R(log(pi_klh[i,]) + joint_probs[,j]))
        }
        llh[i] = sum(llh_j)
      }

    } else {
      llh = NULL
    }

    result = list("pi_klh" = pi_klh, "abomega_combinations" = all_combinations, "loglik" = llh)
  }
  return(result)

}