## SKG
## Likelihood calcs
## But like faster
## Now even better(?)
## 3/10/2010
#' Calculate the log likelihood of a cluster conditioned on size and smear pos. num
#'
#' @param par vector where the first parameter is "beta_0" and the second is "beta_1"
#' @param data data frame with columns
#' \describe{
#' \item{n}{number in cluster}
#' \item{n_pos}{number of positive smears in cluster}
#' \item{freq}{number of times this frequency occurs}
#' }
#' @param sampled_trees_summary data frame with the following columns
#' @param is_truncated logical. Default is FALSE
#' @param return_neg logical. Should I return the negative log likelihood?
#' \describe{
#' \item{i_pos}{number of transmission from a smear positive person}
#' \item{n_pos}{number of positive smears}
#' \item{n}{cluster size}
#' \item{freq}{time it occurs}
#' }
#' @param cond_on_generator logical indicating whether we compute the likelihood conditioned on an outside generator or not. See details
#' @return log likelihood summed over the clusters
#' @export
#' @details TODO
loglike_fast <- function(par,
data,
sampled_trees_summary,
is_truncated = FALSE,
return_neg = FALSE,
cond_on_generator = FALSE){
loglike_fxn <- ifelse(cond_on_generator, loglike_doubly_trunc,
loglike_cond_fast)
p_plus <- as.numeric(1 / (1 + exp(-par[1] - par[2])))
p_neg <- as.numeric(1 / (1 + exp(-par[1])))
w_pos <- sum(data$n_pos * data$freq) / sum(data$n * data$freq)
data <- data %>% ## Rename to observed n to avoid downward confusion
dplyr::rename(obs_n = .data$n,
obs_n_pos = .data$n_pos)
## Backwards compatibility
if(!("obs_n" %in% colnames(sampled_trees_summary))){
sampled_trees_summary <- sampled_trees_summary %>%
dplyr::mutate(obs_n = .data$n,
obs_n_pos = .data$n_pos)
}
df <- sampled_trees_summary %>%
dplyr::group_by(.data$obs_n,
.data$obs_n_pos) %>%
tidyr::nest() %>%
dplyr::mutate(loglike = purrr::map(.data$data,
loglike_fxn,
p_plus = p_plus,
p_neg = p_neg,
is_truncated = is_truncated,
w_pos = w_pos) ) %>%
dplyr::select(-.data$data) %>%
tidyr::unnest(cols = loglike)
df_join <- dplyr::left_join(df, data, by = c("obs_n", "obs_n_pos"))
loglike <- sum(df_join$freq * df_join$loglike)
if(return_neg){
loglike <- -loglike
}
# print(loglike)
return(loglike)
}
#' Calculate the log likelihood of a cluster conditioned on size and smear pos. num
#'
#' @param p_plus probability of smear pos
#' @param p_neg probability of smear neg
#' @param clust_df data frame with the following columns
#' @param is_truncated logical default is FALSE
#' @param w_pos empirical weights of number of positive smears
#' \describe{
#' \item{i_pos}{number of transmission from a smear positive person}
#' \item{i_neg}{number of transmission from a smear negative person (optional)}
#' \item{n_pos}{number of positive smears}
#' \item{n}{cluster size}
#' \item{freq}{frequency of time this particular cluster occurs}
#' }
#' @details if i_neg is not a column in clust_df we will use n-i_pos - 1 as i_neg. If 'is_truncated' is TRUE, we will calculate the the zero-truncated likelihood
#' @return log likelihood summed over the clusters
loglike_cond_fast <- function(p_plus, p_neg,
clust_df,
is_truncated = FALSE,
w_pos = .5){
if(!("i_neg" %in% colnames(clust_df))){
clust_df$i_neg <- clust_df$n[1] - 1 - clust_df$i_pos
}
B <- sum(clust_df$freq)
n <- clust_df$n[1]
n_pos <- clust_df$n_pos[1]
out <- with(clust_df, (p_plus)^i_pos *
(p_neg)^(i_neg) *
(1 - p_plus)^n_pos *
(1 - p_neg)^(n - n_pos) *
freq)
if(any(is.na(out))) stop("Error in calculating likelihood. NAs present")
if(is_truncated){
out <- out / (( (w_pos) * (1-p_plus) + (1 - w_pos) * (1-p_neg)))
}
return(log(sum(out) / B))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.