R/likelihood.R


## Passes tests but doesn't agree with the faster loglike implementation


#' Calculate the log likelihood for a cluster summary
#'
#' @param par vector of parameters "beta_0" and "beta_1"
#' @param cluster_summary output from \code{summarize_clusters()}
#' @param sampled_trees draws from \code{simulate_many_cond_bp}
#' @param return_neg  If TRUE, return the negative log likelihood.
#' @param n_trials number of trials for each "average"
#' @return log likelihood summed over the unique (n, n_pos) combinations in the cluster summary, each weighted by its frequency in the data (single number)
loglike_cluster_summary <- function(par,
                                    cluster_summary,
                                    sampled_trees,
                                    return_neg = FALSE,
                                    n_trials = 10){

  loglike <- rep(0, nrow(cluster_summary))
  for(ii in 1:nrow(cluster_summary)){
    n <- cluster_summary$n[ii]
    n_pos <- cluster_summary$n_pos[ii]
    n_vec <- rep(n, each = n_trials)
    n_pos_vec <- rep(n_pos, each = n_trials)
    sub_sampled_trees <- sampled_trees %>%
      dplyr::filter(.data$cluster_size == n, .data$cluster_pos == n_pos)
    loglike[ii] <- loglike_all_clusts(par = par,
                                      n_vec = n_vec,
                                      n_pos_vec = n_pos_vec,
                                      sampled_trees = sub_sampled_trees,
                                      return_neg = return_neg)

  }

  return(sum(cluster_summary$freq * loglike))

}
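
## Illustrative sketch of how this could feed an optimizer. This assumes
## `cluster_summary` and `sampled_trees` have already been built with
## summarize_clusters() and simulate_many_cond_bp() (calls not shown), and
## the starting values are arbitrary:
##
## fit <- optim(par = c("beta_0" = 0, "beta_1" = 0),
##              fn = loglike_cluster_summary,
##              cluster_summary = cluster_summary,
##              sampled_trees = sampled_trees,
##              return_neg = TRUE)
## fit$par  # maximum likelihood estimates of beta_0 and beta_1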





#' Calculate the log likelihood over all clusters
#'
#' @param par vector of parameters "beta_0" and "beta_1"
#' @param n_vec vector of cluster sizes, one entry per observed cluster
#' @param n_pos_vec vector of the number of positive smears, one entry per observed cluster
#' @param sampled_trees output from \code{simulate_many_cond_bp} where cluster_size and cluster_pos are constant within each sampled cluster
#' @param return_neg  If TRUE, return the negative log likelihood.
#' @return log likelihood summed over all observed clusters (single number)
loglike_all_clusts <- function(par,
                            n_vec,
                            n_pos_vec,
                            sampled_trees,
                            return_neg = FALSE){

  tab <- as.data.frame(table(n_vec, n_pos_vec))
  tab <- tab[tab$Freq != 0,]
  tab$id <- 1:nrow(tab)
  tab$like <- NA

  ## TODO: dplyrify or something
  for(ii in 1:nrow(tab)){
    ## table() stores n_vec and n_pos_vec as factors; recover the unique
    ## (n, n_pos) combination for this row of the frequency table
    n <- as.numeric(as.character(tab$n_vec[ii]))
    n_pos <- as.numeric(as.character(tab$n_pos_vec[ii]))
    trees_sub <- sampled_trees %>% dplyr::filter(.data$cluster_size == n,
                                                 .data$cluster_pos == n_pos)

    tab$like[ii] <- like_sampled_cond(par = par,
                                      sampled_trees = trees_sub,
                                      n = n,
                                      n_pos = n_pos)
  }

  if(any(is.na(tab$like) | is.nan(tab$like) | is.infinite(tab$like))) warning("NA/NAN/Inf likelihood")
  total_loglike <- sum(tab$Freq * log(tab$like), na.rm = TRUE)
  if(return_neg) total_loglike <- -total_loglike
  return(total_loglike)



}
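
## Illustrative sketch with toy inputs (not package data): three observed
## clusters (two of size 2 with 1 positive smear, one of size 1 with 0
## positives) scored against a small set of sampled trees carrying matching
## cluster_size / cluster_pos labels:
##
## par <- c("beta_0" = -1, "beta_1" = 1)
## toy_trees <- data.frame(cluster_id   = c(1, 1, 2),
##                         cluster_size = c(2, 2, 1),
##                         cluster_pos  = c(1, 1, 0),
##                         n_inf        = c(1, 0, 0),
##                         smear        = c(1, -1, -1))
## loglike_all_clusts(par,
##                    n_vec = c(2, 2, 1),
##                    n_pos_vec = c(1, 1, 0),
##                    sampled_trees = toy_trees)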


#' Calculate the likelihood for a cluster of size n with n_pos positive smears
#'
#' @param par vector of parameters "beta_0" and "beta_1"
#' @param sampled_trees output from \code{simulate_many_cond_bp} where all n and n_pos are the same for each cluster (cluster_size and cluster_pos)
#' @param n number in cluster
#' @param n_pos number of positive smears
#' @return likelihood summed over the sampled trees for a cluster with given n and n_pos (single number)
like_sampled_cond <- function(par,
                              sampled_trees,
                              n, n_pos){




  ## Sum the likelihood over the sampled trees
  likes <- sampled_trees %>% dplyr::group_by(.data$cluster_id) %>%
    tidyr::nest() %>%
    dplyr::mutate(like = purrr::map(.data$data,
                                    cluster_like_smear,
                                    par = par,
                                    n = n,
                                    n_pos = n_pos) ) %>%
    dplyr::select(-.data$data) %>%
    tidyr::unnest(cols = c(.data$like))

  tot_like <- sum(likes$like)

  return(tot_like)

  ## plyr version is more readable but alas
  ##
  # test <- plyr::ddply(.data = trees_df, .fun = cluster_like_smear,
  #                     .variables = "cluster_id",
  #                     n = n,
  #                     par = par,
  #                     n_pos = n_pos)


}
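
## Illustrative sketch with toy data standing in for simulate_many_cond_bp()
## output: two sampled trees for a cluster of size n = 2 with n_pos = 1, so
## the result is the sum of cluster_like_smear() over both trees.
##
## par <- c("beta_0" = -1, "beta_1" = 1)
## toy_trees <- data.frame(cluster_id = c(1, 1, 2, 2),
##                         n_inf      = c(1, 0, 0, 1),
##                         smear      = c(1, -1, 1, -1))
## like_sampled_cond(par, toy_trees, n = 2, n_pos = 1)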


#' Calculate the likelihood for a single sampled cluster of size n with n_pos positive smears
#'
#' @param par vector of parameters "beta_0" and "beta_1"
#' @param cluster_df data frame with the following columns
#' \describe{
#' \item{n_inf}{number this person infected}
#' \item{smear}{-1 or +1}
#' }
#' @param n number in cluster
#' @param n_pos number of positive smears
#' @return likelihood of cluster (single number)

cluster_like_smear <- function(par, cluster_df,
                               n,
                               n_pos){
  p_plus <-  1 / (1 + exp(- (par["beta_0"] + par["beta_1"]) ))
  p_minus <- 1 / (1 + exp(- (par["beta_0"]) ))
  n_inf_plus <- sum(cluster_df$n_inf[cluster_df$smear > 0])
  n_inf_minus <- sum(cluster_df$n_inf[cluster_df$smear < 0])

  like <- (1- p_plus)^n_pos *
    (1-p_minus)^(n - n_pos) *
    p_plus^(n_inf_plus) *
    p_minus^(n_inf_minus)
  names(like) <- NULL

  return(like)



}
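
## Illustrative sketch with made-up numbers: a cluster of n = 3 people, one
## smear positive who infected the other two and two smear negatives who
## infected nobody. Each person contributes p^(number infected) times a
## single (1 - p) factor, with p depending on their smear status.
##
## par <- c("beta_0" = -1, "beta_1" = 1)
## toy_cluster <- data.frame(n_inf = c(2, 0, 0), smear = c(1, -1, -1))
## cluster_like_smear(par, toy_cluster, n = 3, n_pos = 1)
## # (1 - 0.5) * (1 - plogis(-1))^2 * 0.5^2, roughly 0.067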


#' Get the probability of transmitting infection based on smear status
#'
#' @param par vector of parameters "beta_0" and "beta_1"
#' @param smear smear status, coded -1 (negative) or +1 (positive)
#' @return probability of transmission, between 0 and 1
prob_trans <- function(par, smear){
  smear <- (smear + 1) / 2
  1 / (1 + exp(- (par["beta_0"] + par["beta_1"] * smear) ))
}
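
## Illustrative sketch (values are arbitrary): with beta_0 = -2 and
## beta_1 = 1, smear is recoded from -1/+1 to 0/1, so a smear-negative case
## transmits with probability plogis(-2) (about 0.12) and a smear-positive
## case with probability plogis(-1) (about 0.27).
##
## par <- c("beta_0" = -2, "beta_1" = 1)
## prob_trans(par, smear = -1)  # 1 / (1 + exp(2)), about 0.12
## prob_trans(par, smear =  1)  # 1 / (1 + exp(1)), about 0.27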