R/likelihood-outside.R

Defines functions like_sampled_trees prob_x0 prob_n0 like_inside_cond like_outside_nx loglike_outside_gen

## SKG
## APRIL 9, 2020
## Holy Thursday
## Trying the outside log like AGAIN
## maybe this time it will work


#' Get the log likelihood estimate for outside generated data
#'
#' @param par vector of beta0, beta1
#' @param data data frame with the following columns
#' \describe{
#' \item{obs_n}{number in cluster}
#' \item{obs_n_pos}{number of positive smears in cluster}
#' \item{freq}{number of times this frequency occurs}
#' }
#' @param sampled_trees data frame with the following columns
#' \describe{
#' \item{i_pos}{number of transmission from a smear positive person}
#' \item{i_neg}{number of transmission from a smear negative person}
#' \item{n_pos}{number of positive smears}
#' \item{n}{cluster size}
#' \item{freq}{time it occurs}
#' }
#' @param B number of samples to draw
#' @param return_neg logical indicating whether we should return negative log likelihood.  Default TRUE
#' @param w_pos probability of a positive smear.  Default is .5
#' @param sampled_vars_list pre computed list of sampling helpers.  Default is NULL meaning we compute within the function.
#' @return log likelihood summed over the clusters
#' @export
#' @details TODO
loglike_outside_gen <- function(par, data,
                                sampled_trees,
                                B,
                                return_neg = TRUE,
                                w_pos = .5,
                                sampled_vars_list = NULL){
    p_plus <- 1 / (1 + exp(-(sum(par) )))
    p_neg <- 1 / (1 + exp(-par[1]))


    sampled_trees_like <- like_sampled_trees(p_plus, p_neg,
                                        sampled_trees) # df with (n, n_pos, avg_like)

    df <- data %>%
        dplyr::mutate(obs_n = .data$n,
                      obs_n_pos = .data$n_pos) %>%
        dplyr::group_by(.data$obs_n,
                        .data$obs_n_pos,
                        .data$freq) %>%
        tidyr::nest() %>%
        dplyr::mutate(like = purrr::map(.data$data,
                                           like_outside_nx,
                                           sampled_trees = sampled_trees_like,
                                           p_plus = p_plus,
                                           p_neg = p_neg,
                                           w_pos = w_pos,
                                           B = B,
                                        sampled_vars_list = sampled_vars_list) ) %>%
        dplyr::select(-.data$data) %>%
        tidyr::unnest(cols = like)  %>%
        dplyr::mutate(loglike = log(.data$like))


 


<<<<<<< HEAD
    loglike <- sum(df$freq * df$loglike) 
=======
    loglike <- sum(df$freq * df$loglike)
>>>>>>> 7e1a77c693066842666d17ce2eedcb06dc5d9643
    if(return_neg){
        loglike <- -loglike
    }
    ## print(loglike)
    return(loglike)
    

}


#' Calculate the conditional likelihood for outside generator clusters
#'
#' @param data data frame with the following columns
#' \describe{
#' \item{n}{cluster size}
#' \item{n_pos}{number of positive smears}
#' }
#' @param sampled_trees data frame with the following columns
#' \describe{
#' \item{n}{cluster size}
#' \item{n_pos}{number of positive smears}
#' \item{avg_like}{like for this set}
#' }
#' @param p_plus probability of tranmsmitting disease from a positive smear
#' @param p_neg probability of transmistting disease from negative smear
#' @param w_pos probability of a positive smear
#' @param B number of samples
#' @param sampled_vars_list pre computed list of sampling helpers.  
#' Default is NULL meaning we compute within the function.
#' @return average likelihood contribution from cluster of size n with n_pos positive smears
#' @details TODO
like_outside_nx <- function(data,
                            sampled_trees,
                            p_plus, p_neg,
                            w_pos, B,
                            sampled_vars_list = NULL){


    n <- data$n
    n_pos <- data$n_pos
    stopifnot(length(n) == 1)


    if(is.null(sampled_vars_list)){
    ## Sample the new things we need:  (x0, n0, nprime, xprime)
        sampled_vars_list <- sample_outside_gen_vars(n, n_pos, B)
    } else {
        sampled_vars_list <- sampled_vars_list[[paste0(n, "-", n_pos)]]
    }


    ## Calculate the average *likelihood* through recursion and the previous sampled trees
    like <- sapply(sampled_vars_list, like_inside_cond,
                   sampled_trees_calc = sampled_trees,
                   p_plus = p_plus, p_neg = p_neg,
                   w_pos = w_pos)
    return(mean(like))

}


#' Calculate the likelihood, summing over different sub-trees
#'
#' @param sampled_vars_list list with the following entries
#' \describe{
#' \item{n_prime}{vector of non-empty partitions}
#' \item{n_pos_prime}{vector of possibly empty partitions}
#' \item{n0}{number outside generator infects}
#' \item{x0}{smear of outside generator}
#' }
#' @param sampled_trees_calc data frame with the following columns
#' \describe{
#' \item{n}{number of nodes in sampled tree}
#' \item{n_pos}{number of positive smears}
#' \item{avg_like}{pre-calculated likelihood for given tree}
#' }
#' @param p_plus probability of positive transmission
#' @param p_neg probability of a negative transmission
#' @param w_pos probablity of smear positive
#' @return a single number, the likelihood for the possibly multi-root tree
like_inside_cond <- function(sampled_vars_list,
                             sampled_trees_calc,
                             p_plus, p_neg,
                             w_pos){
    	n_vec <- sampled_vars_list$n_prime
        n_pos_vec <- sampled_vars_list$n_pos_prime
        x0 <- sampled_vars_list$x0
        n0 <- sampled_vars_list$n0
        if(length(n_vec) != length(n_pos_vec)) browser()
      ##  inner_df <- data.frame(n = n_vec,
        ##                       n_pos = n_pos_vec)
      ##  joined_df <- dplyr::left_join(inner_df, sampled_trees_calc,
       ##                               by = c("n", "n_pos"))
        avg_like <- sapply(1:length(n_vec), function(ii){
            sampled_trees_calc$avg_like[sampled_trees_calc$n == n_vec[ii] &
                                            sampled_trees_calc$n_pos == n_pos_vec[ii]]
        })

        n <- sum(n_vec)
        
        ## probabilities
        p_x0 <- prob_x0(x0, w_pos)
        p_n0 <- prob_n0(p_plus, p_neg, x0, n0, n)
        # #p_like1 <- joined_df$avg_like
        p_like1 <- avg_like
        like <- sum(p_x0 * p_n0 * p_like1)
        if(is.na(like)) browser()
        return(like)

}


#' Get the probability of infecting n0 people
#'
#' @param p_plus probability of infection if positive smear
#' @param p_neg probability of infection if negative smear
#' @param x0 0/1 -/+ smear for generator
#' @param n0 number of observed infections by generator
#' @param n maximum possible number of infections by generator
prob_n0 <- function(p_plus, p_neg, x0, n0, n){
    p0 <- ifelse(x0 == 1, p_plus, p_neg)
    denom <- sum(p0^(1:n))
    num <- p0^n0
    return(num/denom)
                 
}



#' Return probability of given smear
#'
#' @param x0 observed smear (0/1 = -/+)
#' @param w_pos probability of a smear
prob_x0 <- function(x0, w_pos){
    return(ifelse(x0 == 0, 1 - w_pos,
                  w_pos))
}


#' Calculate the likelihood for a single conditional tree
#'
#' @param p_plus probability of positive transmission
#' @param p_neg probability of negative transmission
#' @param sampled_trees data frame with the following columns
#' \describe{
#' \item{n}{number in cluster}
#' \item{n_pos}{number of postive smears in cluster}
#' \item{i_pos}{number of positive transmission}
#' \item{i_neg}{number of negative transmission}
#' \item{freq}{times this combination occurred}
#' }
#' @return data frame with average likelihood for clusters of size n, n_pos smears
like_sampled_trees <- function(p_plus, p_neg, sampled_trees){
    sampled_trees$p_plus <-  p_plus
    sampled_trees$p_neg <-  p_neg
    sampled_trees <- sampled_trees %>%
        dplyr::group_by(.data$n, .data$n_pos) %>%  # will this work who knows
        dplyr::summarize(avg_like = (
            sum((1- .data$p_plus)^.data$n_pos *
            (1- .data$p_neg)^(.data$n-.data$n_pos) *
            .data$p_plus^(.data$i_pos) *
            .data$p_neg^(.data$i_neg) *
            .data$freq)) / 
                sum(.data$freq)
        )
	
	return(sampled_trees)
	
	
}

## NEXT
## SIMULATE TABLES UP to (n,x) n = 1, .., 5, x = 0, ..., 5 and save
## Then try this out PLEASE WORKPatri
skgallagher/TBornotTB documentation built on April 21, 2020, 1:19 p.m.