TBornotTB: Analyze Tuberculosis Patients in Maryland

Documented in loglike_cond_fast loglike_fast

## SKG
## Likelihood calcs
## But like faster
## Now even better(?)
## 3/10/2010



#' Calculate the log likelihood of a cluster conditioned on size and smear pos. num
#'
#' @param par vector where the first parameter is "beta_0" and the second is "beta_1"
#' @param data data frame with columns
#' \describe{
#' \item{n}{number in cluster}
#' \item{n_pos}{number of positive smears in cluster}
#' \item{freq}{number of times this frequency occurs}
#' }
#' @param sampled_trees_summary data frame with the following columns
#' @param is_truncated logical.  Default is FALSE
#' @param return_neg logical.  Should I return the negative log likelihood?
#' \describe{
#' \item{i_pos}{number of transmission from a smear positive person}
#' \item{n_pos}{number of positive smears}
#' \item{n}{cluster size}
#' \item{freq}{time it occurs}
#' }
#' @param cond_on_generator logical indicating whether we compute the likelihood conditioned on an outside generator or not.  See details
#' @return log likelihood summed over the clusters
#' @export
#' @details TODO
loglike_fast <- function(par,
                         data,
                         sampled_trees_summary,
                         is_truncated = FALSE,
                         return_neg = FALSE,
                         cond_on_generator = FALSE){
  
  loglike_fxn <- ifelse(cond_on_generator, loglike_doubly_trunc,
                        loglike_cond_fast)


    p_plus <- as.numeric(1 / (1 + exp(-par[1] - par[2])))
    p_neg <- as.numeric(1 / (1 + exp(-par[1])))
    w_pos <- sum(data$n_pos * data$freq) / sum(data$n * data$freq)

    data <- data %>%  ## Rename to observed n to avoid downward confusion
        dplyr::rename(obs_n = .data$n,
                      obs_n_pos = .data$n_pos)

   
    
    ## Backwards compatibility
    if(!("obs_n" %in% colnames(sampled_trees_summary))){
        sampled_trees_summary <- sampled_trees_summary %>%
            dplyr::mutate(obs_n = .data$n,
                          obs_n_pos = .data$n_pos)
    }

    df <- sampled_trees_summary %>%
        dplyr::group_by(.data$obs_n,
                      .data$obs_n_pos) %>%
        tidyr::nest() %>%
        dplyr::mutate(loglike = purrr::map(.data$data,
                                           loglike_fxn,
                                           p_plus = p_plus,
                                           p_neg = p_neg,
                                           is_truncated = is_truncated,
                                           w_pos = w_pos) ) %>%
        dplyr::select(-.data$data) %>%
        tidyr::unnest(cols = loglike)  
    



  df_join <- dplyr::left_join(df, data, by = c("obs_n", "obs_n_pos"))


  loglike <- sum(df_join$freq * df_join$loglike)
  if(return_neg){
    loglike <- -loglike
  }
 # print(loglike)
  return(loglike)


}


#' Calculate the log likelihood of a cluster conditioned on size and smear pos. num
#'
#' @param p_plus probability of smear pos
#' @param p_neg probability of smear neg
#' @param clust_df data frame with the following columns
#' @param is_truncated logical default is FALSE
#' @param w_pos empirical weights of number of positive smears
#' \describe{
#' \item{i_pos}{number of transmission from a smear positive person}
#' \item{i_neg}{number of transmission from a smear negative person (optional)}
#' \item{n_pos}{number of positive smears}
#' \item{n}{cluster size}
#' \item{freq}{frequency of time this particular cluster occurs}
#' }
#' @details if i_neg is not a column in clust_df we will use n-i_pos - 1 as i_neg.  If 'is_truncated' is TRUE, we will calculate the the zero-truncated likelihood
#' @return log likelihood summed over the clusters
loglike_cond_fast  <- function(p_plus, p_neg,
                               clust_df,
                               is_truncated = FALSE,
                               w_pos = .5){

    if(!("i_neg" %in% colnames(clust_df))){
        clust_df$i_neg <- clust_df$n[1] - 1 - clust_df$i_pos
    }
    B <- sum(clust_df$freq)
    n <- clust_df$n[1]
    n_pos <- clust_df$n_pos[1]
    out <- with(clust_df, (p_plus)^i_pos *
                          (p_neg)^(i_neg) *
                          (1 - p_plus)^n_pos *
                          (1 - p_neg)^(n - n_pos) *
                freq)
    if(any(is.na(out))) stop("Error in calculating likelihood.  NAs present")
    if(is_truncated){
        out <- out / (( (w_pos) * (1-p_plus) + (1 - w_pos) * (1-p_neg)))
    }
    
    return(log(sum(out) / B))
}