TBornotTB: Analyze Tuberculosis Patients in Maryland

Documented in impute_root impute_root_inner

## SKG
## March 27, 2020
##
## We need to properly compute the likelihood when the root node is unobserved.  There are a few ways to do this but I think the easiest (given current software) may be to impute the root covariate and then find the likelihood of the imputed tree.  That is for cluster size n, number of positives x, and frequency f, and scaling factor B
## (n, x, f) -> (n + 1, x + 1, B1), (n + 1, x, B2) where B1 + B2 = Bf
## and B1 ~ Binomial(B, p_x) where p_x is the probability of a positive smear.


#' Impute the root node for an observed cluster
#'
#' @param data data frame with the following columns
#' \describe{
#' \item{n}{size of cluster}
#' \item{n_pos}{number of positive smears}
#' \item{freq}{frequency of cluster of size n, positive smear x}
#' }
#' @param B scaling factor of data imputation.  Default is 1.
#' @param prob_pos probability of imputed root being positive smear.
#' @return data frame with the following columns.
#' \describe{
#' \item{obs_n}{number in original observed cluster}
#' \item{obs_n_pos}{number positive in original observed cluster}
#' \item{n}{number in imputed cluster (obs_n + 1)}
#' \item{n_pos}{number in imputed cluster, either obs_n_pos or obs_n_pos + 1}
#' \item{B}{number of times this occurred}
#' }
#' @examples
#' data <- data.frame(n = c(1, 1, 5), n_pos = c(1, 0, 3), freq = c(10, 2, 4))
#' out <- impute_root(data)
#' head(out)
impute_root <- function(data,
                        B = 1,
                        prob_pos = .5){
    
    new_data <- data  %>%
        dplyr::mutate(obs_n = .data$n, obs_n_pos = .data$n_pos)  %>%
        dplyr::group_by(.data$obs_n, .data$obs_n_pos) %>%
        tidyr::nest() %>%
        dplyr::mutate(root = purrr::map(.data$data,
                                        n = .data$obs_n,
                                        n_pos = .data$obs_n_pos,
                                        impute_root_inner,
                                        B = B,
                                        prob_pos = prob_pos)) %>%
        dplyr::select(-.data$data) %>%
        tidyr::unnest(cols = root)
        
    return(new_data)


}



#' Impute the root node for an observed cluster (inner function)
#'
#' @param data 
#' \describe{
#' \item{n}{cluster size}
#' \item{n_pos}{number of positive smears in cluster}
#' @param n cluster size
#' @param n_pos number of positive
#' @param B scaling factor of data imputation.  Default is 1.
#' @param prob_pos probability of imputed root being positive smear.
#' @return
#' \describe{
#' \item{obs_n}{number in original observed cluster}
#' \item{obs_n_pos}{number positive in original observed cluster}
#' \item{n}{number in imputed cluster (obs_n + 1)}
#' \item{n_pos}{number in imputed cluster, either obs_n_pos or obs_n_pos + 1}
#' \item{B}{number of times this occurred}
#' }
#' @examples
#' data <- data.frame(n = 1, n_pos = 1, freq = 10
#' out <- impute_root_inner(data)
#' head(out)
impute_root_inner <- function(data,
                              n,
                              n_pos,
                              B = 1,
                              prob_pos = .5){

    f1 <- rbinom(n = 1,
                 size = B,
                 prob = prob_pos)
    f2 <- B - f1
    df <- data.frame(n = data$n + 1,
                     n_pos = c(data$n_pos + 1, data$n_pos),
                     B = c(f1, f2))
    df <- df %>% dplyr::filter(df$B > 0)
    return(df)


}