TBornotTB: Analyze Tuberculosis Patients in Maryland

Documented in format_to_outsider move_up_gen simulate_outbreak

## SKG
## March 12, 2020
##
## Generate clusters based on an outsider
## Basically we will relabel the no-outsider simulations
## All generations go down by one number
## Will take out the index person
## update i accordingly




#' Simulate the process of flipping until failure for K clusters
#'
#' @param K number of total clusters to simulate
#' @param inf_params vector with beta_0 and beta_1
#' @param smear_pos_prob probability of a smear positsive
#' @param max_size maximum size a cluster can be
#' @param start_at_outsider if TRUE, we start at an unobserved outsider instead of a single index case
#' @param keep_zero_clusters Do I keep track of the the cluster consisting only of the unobserved root node?  Default is FALSE.
#' @param sample_covariates used for multi-variate people
#' @return data frame with the following columns
#' \describe{
#' \item{cluster_id}{unique cluster ID}
#' \item{person_id}{order of infection in the cluster}
#' \item{smear}{- 1 (negative) / + 1 (positive)}
#' \item{gen}{generation number (>=0)}
#' \item{inf_id}{ID of the infector}
#' \item{n_inf}{number of people infected by person}
#' \item{cluster_pos}{number of positive smears in the cluster}
#' \item{cluster_size}{number in cluster}
#' \item{censored}{whether the cluster end was censored or not}
#' }
#' @details breadth not depth.  Generate generation by generation as opposed to going up the branch til termination.
#' @export
#' @examples
#' inf_params <- c("beta_0" = -2, "beta_1" = 1)
#' smear_pos_prob <- .8
#' max_size <- 30
#' K <- 5
#' out <- simulate_outbreak(K = K,
#'                                 inf_params = inf_params,
#'                                 smear_pos_prob = smear_pos_prob,
#'                                 max_size = max_size)
simulate_outbreak <- function(K,
                              inf_params,
                              smear_pos_prob,
                              max_size = 50,
                              start_at_outsider = FALSE,
                              keep_zero_clusters = FALSE,
                              sample_covariates = NULL){

  clusters <- simulate_flip_til_failure(K = K,
                                        inf_params = inf_params,
                                        smear_pos_prob = smear_pos_prob,
                                        max_size = max_size)

  if(start_at_outsider){
      clusters <- format_to_outsider(clusters,
                                     keep_zero_clusters)
  }
  return(clusters)

}







#' Make the root node an outsider and adjust accordingly
#'
#' @param clusters data frame with the following columns
#' @param keep_zero_clusters Do I keep track of the the cluster consisting only of the unobserved root node?  Default is FALSE.
#' \describe{
#' \item{cluster_id}{unique cluster ID}
#' \item{person_id}{order of infection in the cluster}
#' \item{smear}{- 1 (negative) / + 1 (positive)}
#' \item{gen}{generation number (>=0)}
#' \item{inf_id}{ID of the infector}
#' \item{n_inf}{number of people infected by person}
#' \item{cluster_pos}{number of positive smears in the cluster}
#' \item{cluster_size}{number in cluster}
#' \item{censored}{whether the cluster end was censored or not}
#' }
format_to_outsider <- function(clusters,
                               keep_zero_clusters = FALSE){
  ## change generations
  clusters <- clusters %>%
    dplyr::mutate(gen = .data$gen - 1,
                  cluster_size = .data$cluster_size - 1) %>%
    dplyr::mutate(person_id = move_up_gen(.data$person_id)) %>%
    dplyr::mutate(inf_id = move_up_gen(.data$inf_id)) %>%
    dplyr::mutate(inf_id = ifelse(.data$gen == 1, NA,
                  .data$inf_id))

     
    if(keep_zero_clusters){
        clusters <- clusters %>%
                dplyr::filter(.data$cluster_size == 0 | .data$gen !=0) 
    } else {
        clusters <- clusters %>%
                dplyr::filter(.data$gen !=0) 
    }
    clusters <- clusters %>%
        dplyr::group_by(.data$cluster_id) %>%
        dplyr::mutate(cluster_pos = sum(.data$smear > 0, na.rm = TRUE))
  return(clusters)


}

#' Change ID by subtracting one to the generation
#'
#' @param id format like "CX-GY-NZ"
#' @return id now like "CX-G(Y-1)-NZ"
move_up_gen <- function(id){
  gen <- as.numeric(gsub(".*-G(.+)-N.*", "\\1", id))
  new_id <- sapply(1:length(id), function(x) {
    y <- id[x]
    g <- gen[x] - 1
    gsub("-G(.+)-N", paste0("-G", g,
                                    "-N"), y)})
  return(new_id)

}