R/simulate-flip-til-failure.R

Defines functions generation_infection flip_til_failure simulate_flip_til_failure summarize_clusters

Documented in flip_til_failure generation_infection simulate_flip_til_failure summarize_clusters

## Flip 'til failure
## 2/3/2020
## simulations


#' Summarize the clusters
#'
#' @param clusters_df a data frame with
#' @return data frame with the following columns
#' \describe{
#' \item{cluster_id}{unique cluster ID}
#' \item{person_id}{order of infection in the cluster}
#' \item{smear}{- 1 (negative) / + 1 (positive)}
#' \item{gen}{generation number (>=0)}
#' \item{inf_id}{ID of the infector}
#' \item{n_inf}{number of people infected by person}
#' \item{cluster_pos}{number of positive smears in the cluster}
#' \item{cluster_size}{number in cluster}
#' \item{censored}{whether the cluster end was censored or not}
#' }
#' @return data frame with
#' \describe{
#' \item{n}{number in cluster}
#' \item{n_pos}{number of positive smears in cluster}
#' \item{freq}{frequency this occurs}
#' }
summarize_clusters <- function(clusters_df){
  df1 <- clusters_df %>% dplyr::mutate(id = paste(.data$cluster_size,
                                                  .data$cluster_pos, sep = "-")) %>%
    dplyr::group_by(.data$cluster_id) %>% dplyr::summarize(id = .data$id[1],
                                                   n = .data$cluster_size[1],
                                                   n_pos = .data$cluster_pos[1])
  df2 <- df1 %>% dplyr::group_by(.data$id) %>%
    dplyr::summarize(freq = dplyr::n(),
                     n = .data$n[1],
                     n_pos = .data$n_pos[1]) %>%
    dplyr::select(-.data$id)

  return(df2)



}

#' Simulate the process of flipping until failure for K clusters
#'
#' @param K number of total clusters to simulate
#' @param inf_params vector with beta_0 and beta_1
#' @param smear_pos_prob probability of a smear positsive
#' @param max_size maximum size a cluster can be
#' @return data frame with the following columns
#' \describe{
#' \item{cluster_id}{unique cluster ID}
#' \item{person_id}{order of infection in the cluster}
#' \item{smear}{- 1 (negative) / + 1 (positive)}
#' \item{gen}{generation number (>=0)}
#' \item{inf_id}{ID of the infector}
#' \item{n_inf}{number of people infected by person}
#' \item{cluster_pos}{number of positive smears in the cluster}
#' \item{cluster_size}{number in cluster}
#' \item{censored}{whether the cluster end was censored or not}
#' }
#' @details breadth not depth.  Generate generation by generation as opposed to going up the branch til termination.
#' @export
simulate_flip_til_failure <- function(K,
                                      inf_params,
                                      smear_pos_prob,
                                      max_size = 30){
  cluster_list <- vector(mode = "list", length = K)
  for(ii in 1:K){
    cluster_list[[ii]] <- flip_til_failure(cluster_id = ii,
                                          inf_params = inf_params,
                                          smear_pos_prob = smear_pos_prob,
                                          max_size = max_size)
  }

  df <- dplyr::bind_rows(cluster_list)
  return(df)

}



#' Simulate the process of flipping until failure for a single cluster
#'
#' @param cluster_id unique ID of the cluster
#' @param inf_params vector with beta_0 and beta_1
#' @param smear_pos_prob probability of a smear positsive
#' @param max_size maximum size a cluster can be
#' @return data frame with the following columns
#' \describe{
#' \item{cluster_id}{unique cluster ID}
#' \item{person_id}{order of infection in the cluster}
#' \item{smear}{- 1 (negative) / + 1 (positive)}
#' \item{gen}{generation number (>=0)}
#' \item{inf_id}{ID of the infector}
#' \item{n_inf}{number of people infected by person}
#' \item{cluster_pos}{number of positive smears in the cluster}
#' \item{cluster_size}{number in cluster}
#' \item{censored}{whether the cluster end was censored or not}
#' }
#' @details breadth not depth.  Generate generation by generation as opposed to going up the branch til termination.
flip_til_failure <- function(cluster_id,
                             inf_params,
                             smear_pos_prob,
                             max_size = 30){

  df <- data.frame(cluster_id = rep(cluster_id, max_size),
                   person_id = NA,
                   smear = NA,
                   gen = NA,
                   inf_id = NA,
                   n_inf = NA,
                   cluster_size = 1,
                   cluster_pos = NA,
                   censored = FALSE)

  censored <- FALSE

  p_pos <- 1 / (1 + exp(- inf_params["beta_0"] - inf_params["beta_1"]))
  p_neg <- 1 / (1 + exp(- inf_params["beta_0"]))

  cluster_size <- 1
  smear_pos_prob <- ifelse(smear_pos_prob == 1, .999999999, # v hacky
                           ifelse(smear_pos_prob == 0, .00000000001,
                                  smear_pos_prob))
  gen_list <- vector(mode = "list", length = max_size)
  gen_list[[1]] <- data.frame(cluster_id = cluster_id,
                                      person_id = paste0("C", cluster_id, "-G1-N1"),
                                      smear = sample(0:1, size = 1,
                                                     prob = c(1- smear_pos_prob,
                                                              smear_pos_prob)),
                                      gen = 1,
                                      inf_id = NA,
                                      n_inf = NA,
                                      cluster_size = 1,
                                      cluster_pos = NA,
                              stringsAsFactors = FALSE)
  for(gg in 2:(max_size - 1)){
    next_gen_output <- generation_infection(cluster_id = cluster_id,
                                            p_pos = p_pos,
                                            p_neg = p_neg,
                                            smear_pos_prob = smear_pos_prob,
                                            gen = gg,
                                            prev_gen = gen_list[[gg-1]],
                                            cluster_size = cluster_size,
                                            max_size = max_size)
    if(cluster_size == next_gen_output$new_cluster_size){ # no new generations
      break
    }
    cluster_size <- next_gen_output$new_cluster_size
    gen_list[[gg]] <- next_gen_output$cur_gen
    gen_list[[gg-1]]$n_inf <- next_gen_output$n_inf
    if(max_size <= cluster_size){ # Hit the max size
      censored <- TRUE
      break
    }



  }
  sim_df <- dplyr::bind_rows(gen_list)
  sim_df$cluster_pos <- sum(sim_df$smear > 0)
  sim_df$cluster_size <- nrow(sim_df)
  sim_df$censored <- censored
  sim_df$cluster_size <- cluster_size
  sim_df$smear <- ifelse(sim_df$smear == 0, -1, sim_df$smear) # easy translation
  return(sim_df)

}



#' Infect the current generation given the previous generation
#'
#' @param cluster_id unique cluster ID
#' @param p_pos probability of being infected given a smear positive
#' @param p_neg probability of being infected given a smear negative
#' @param smear_pos_prob probability of being smear positive
#' @param gen new generation number
#' @param prev_gen data frame with at least smear status (0/1) and person_id
#' @param cluster_size size of previous cluster
#' @param max_size maximum size of cluster
#' @return list with the following entries
#' \describe{
#' \item{new_cluster_size}{new cluster size with the current generation}
#' \item{cur_gen}{data frame of current generation (new infections)}
#' \item{n_inf}{number of infections for each person in the previous generation}
#' }
generation_infection <- function(cluster_id,
                                 p_pos,
                                 p_neg,
                                 smear_pos_prob,
                                 gen,
                                 prev_gen,
                                 cluster_size,
                                 max_size){


  n_new_inf <- rep(0, nrow(prev_gen))
  smear <- prev_gen$smear
  p <- ifelse(smear == 1, p_pos, p_neg)
  p <- ifelse(p == 1, .99999999, p) # pretty hacky
  for(ii in 1:nrow(prev_gen)){
    n_new_inf[ii] <- rgeom(n = 1, prob = 1 - p[ii])
    new_cluster_size <- cluster_size +
      sum(n_new_inf[1:ii])
    if(new_cluster_size >= max_size){
    #  browser()
      if(ii > 1){
        n_new_inf[ii] <- (max_size -  cluster_size - sum(n_new_inf[1:(ii-1)]))
      } else {
        n_new_inf[ii] <- max_size - cluster_size
      }
      break  # Have found enough
    }
  }
#  browser()
  n <- sum(n_new_inf)
  if(n == 0){ # No new infections in next generations
    return(list(new_cluster_size = cluster_size,
                cur_gen = NULL,
                n_inf = n_new_inf))
  }
  new_cluster_size <- n + cluster_size


  cur_gen <- data.frame(cluster_id = rep(cluster_id, n),
             person_id = paste0("C", cluster_id, "-G", gen, "-N", 1:n),
             smear = sample(0:1, size = n, replace = TRUE,
                            prob = c(1- smear_pos_prob,
                                     smear_pos_prob)),
             gen = gen,
             inf_id = rep(prev_gen$person_id, times = n_new_inf),
             n_inf = NA,
             cluster_size = NA,
             cluster_pos = NA,
             stringsAsFactors = FALSE)

  return(list(new_cluster_size = n + cluster_size,
         cur_gen = cur_gen,
         n_inf = n_new_inf))

}
skgallagher/TBornotTB documentation built on April 21, 2020, 1:19 p.m.