R/simulate-conditional-bp.R

Defines functions assign_smear count_infections assign_infector part_to_cluster draw_part draw_n_gen draw_part_from_list simulate_cond_bp simulate_many_cond_bp

Documented in assign_infector assign_smear count_infections draw_n_gen draw_part draw_part_from_list part_to_cluster simulate_cond_bp simulate_many_cond_bp

## Generation | covariates
##




#' Wrapper for simulate_cond_bp
#'
#' @param K number of clusters to generate
#' @param n_vec number of people in each cluster
#' @param n_pos_vec number of positive smears in each cluster
#' @return data frame with the following columns
#' @param part_list a list of unique partitions with
#' @param one_init logical.  If TRUE, we assume the tree grew from one person
#' @param g_weight_list a summary of part_list, gives the number of times generations of size g appear in a cluster of size n
#' \describe{
#' \item{cluster_id}
#' \item{person_id}{order of infection in the cluster}
#' \item{smear}{-1 (negative) / + 1 (positive)}
#' \item{gen}{generation number (>=0)}
#' \item{inf_id}{ID of the infector}
#' \item{n_inf}{number of people infected by person}
#' \item{cluster_pos}{number of positive smears in the cluster}
#' }
#' @export
simulate_many_cond_bp <- function(K, n_vec, n_pos_vec,
                                  part_list =  NULL,
                                  g_weight_list = NULL,
                                  one_init = FALSE){
  if(any(n_pos_vec > n_vec)){
    stop("You have more positive smears than people in the cluster!")
  }
  ll <- vector(mode = "list", length = K)
  for(ii in 1:K){
    ll[[ii]] <- simulate_cond_bp(k = ii, n = n_vec[ii],
                                 n_pos = n_pos_vec[ii],
                                 part_list = part_list,
                                 one_init = one_init,
                                 g_weight_list = g_weight_list)
  }

  df <- dplyr::bind_rows(ll)
  return(df)
}


#' Simulate a single branching process conditioned on the given data
#'
#' @param k cluster ID
#' @param n number of people in the cluster
#' @param n_pos number of positive smears in the cluster
#' @param part_list a list of unique partitions with
#' @param one_init logical.  If TRUE, we assume the tree grew from one person
#' @param g_weight_list a summary of part_list, gives the number of times generations of size g appear in a cluster of size n
#' @return data frame with the following columns
#' \describe{
#' \item{cluster_id}
#' \item{person_id}{order of infection in the cluster}
#' \item{smear}{-1 (negative) / + 1 (positive)}
#' \item{gen}{generation number (>=0)}
#' \item{inf_id}{ID of the infector}
#' \item{n_inf}{number of people infected by person}
#' \item{cluster_pos}{number of positive smears in the cluster}
#' }
#' @export
simulate_cond_bp <- function(k, n, n_pos,
                             part_list = NULL,
                             one_init = FALSE,
                             g_weight_list = NULL){
  if(is.null(g_weight_list) |
     is.null(part_list)) stop("generate a valid weight list and part_list")
  part <- draw_part_from_list(n, part_list,
                              g_weight_list, one_init)
  cluster <- part_to_cluster(k, part)
  smear <- assign_smear(n, n_pos)
#  if(length(smear) != nrow(cluster)) browser()
  cluster$smear <- smear
  cluster$cluster_pos <- n_pos
  return(cluster)

}

#' Draw partition from pre-computed list
#'
#' @param n number of total people in cluster
#' @param part_list pre computed list of partitions (see details)
#' @param g_weight_list a summary of part_list, gives the number of times generations of size g appear in a cluster of size n
#' @return a matrix of dimension g x 1
#' @details \code{part_list} is a list of lists.  The first list is named by the total number in the cluster, "n{x}" where x is the number in that cluster.  The second list is "g{y}" where y is the number of generations in that partition.  Each entry in the second list is a matrix with number rows of size y.  Each column is a unique partition and so all columns will sum to n.
draw_part_from_list <- function(n, part_list = NULL,
                                g_weight_list = NULL,
                                one_init = FALSE){

  if(n == 1) return(matrix(1, ncol = 1))
  if(one_init) n <- n-1 # if one root, we will add at end
  sub_part_list <- part_list[[paste0("n", n)]]
  wts <- g_weight_list[[n]]
  n_gen <- sample(1:n, prob = wts / sum(wts), size = 1)
  sub_parts <- sub_part_list[[paste0("g", n_gen)]]
  sampled_ind <- sample(1:ncol(sub_parts), size = 1)
  sampled_part <- sub_parts[, sampled_ind]
  ## if forcing one person init, add to beginning
  if(one_init) sampled_part <- c(1, sampled_part)
  sampled_part <- matrix(sampled_part,
                         ncol = 1)
  return(sampled_part)

}

#' Draw the number of generations for a fixed cluster size
#'
#' @param n the cluster size
#' @param lambda the mean number in a Poisson distribution to draw from
#' @return a number between 1 and n, which is the number of generations
draw_n_gen <- function(n, lambda = 3){
  n_gen <- rpois(1, lambda) + 1
  if(n_gen > n) n_gen <- n
  return(n_gen)

}


#' Draw the number in each generation g
#'
#' @param n the number in the cluster
#' @param g the number of (non-zero) generations
#' @return a matrix of dimension g x 1
draw_part <- function(n, g){
  parts <- partitions::restrictedparts(n = n, m = g,
                                       include.zero = FALSE)
  if(ncol(parts) == 1){
    sampled_part <- matrix(parts[,1], ncol = 1)
  } else {

    unique_perm_parts <- matrix(unlist(apply(parts, 2,  unique_perm)),
                                nrow = g)
    sampled_part_ind <- sample(1:ncol(unique_perm_parts), size = 1)
    sampled_part <- unique_perm_parts[, sampled_part_ind, drop = FALSE]
  }
  return(sampled_part)
}


#' Take the partitions by generations and
#' turn it into a full transmission path
#'
#' @param k cluster ID
#' @param part matrix of dimension g x 1, the number in each generation
#' @return data frame with the following columns
#' \describe{
#' \item{cluster_id}{cluster_id}
#' \item{gen}{generation number (1 to n)}
#' \item{gen_n}{number in generation}
#' \item{id}{C{x}-G{y}-N{z}, cluster x, generation y, number z makes up the unique ID}
#' \item{cluster_size}{number in cluster k (n)}
#' \item{infector_id}{who infected this person}
#' \item{n_inf}{number of infections caused by this person}
#' }
part_to_cluster <- function(k, part){
  n <- sum(part)
  g <- nrow(part)
  cluster <- data.frame(cluster_id = rep(k, n),
                        gen = 1, gen_n = 1,
                        cluster_size = n,
                        infector_id = NA)
  cluster$gen <- rep(1:g, times = part)
  cluster <- cluster %>% dplyr::group_by(.data$gen) %>% dplyr::mutate(gen_n = dplyr::row_number())
  cluster$id <- paste0("C", k, "-G", cluster$gen, "-N", cluster$gen_n)
  cluster$infector_id <- assign_infector(cluster$gen, part, k)
  cluster$n_inf <- count_infections(cluster)

  return(cluster)

}



#' Assign an infector ID to someone in previous generation
#'
#' @param gen_vec vector of generation numbers
#' @param part, matrix of partitions g x 1
#' @param k cluster ID
#' @return a vector of length n, which gives the id of the person who infected this one or NA if person is in first generation
assign_infector <- function(gen_vec, part, k){
  infector_id_temp <- sapply(gen_vec, function(g){
    if(g == 1) return(NA)
    n_in_gen <- part[g-1, 1]
    sample(x = 1:n_in_gen, size = 1)
  })
  infector_id <- ifelse(is.na(infector_id_temp), NA,
                    paste0("C", k, "-G", gen_vec - 1, "-N", infector_id_temp))
  return(infector_id)

}

#' Count the number of infections each person gives
#'
#' @param df data frame with the following columns
#' \describe{
#' \item{id}{c{x}-g{y}-n{z}, cluster x, generation y, number z makes up the unique ID}
#' \item{infector_id}{c{x}-g{y}-n{z}, cluster x, generation y, number z makes up the unique ID}
#' }
#' @return counts corresponding to each id.
#' @details  the count is the number of people the person with the ID infected, i.e. the number of times their id appears in infector_id column (can be zero)
count_infections <- function(df){
    fac_var <- factor(df$infector_id, levels = df$id)
    counts <- table(fac_var)
    counts <- as.numeric(counts)
    names(counts) <- NULL
    return(counts)
}


#' Assign a fixed number of smears to the group
#'
#' @param n number of people in group
#' @param n_pos number of positive smears
#' @return vector of size n of exactly n_pos 1s and n - n_pos -1
assign_smear <- function(n, n_pos){
  if(n == n_pos){
    smear <- rep(1, n)
  } else if(n_pos == 0){
    smear <- rep(-1, n)
  } else {
    smear <- c(rep(1, n_pos), rep(-1, n - n_pos))
  }
  return(sample(smear))
}
skgallagher/TBornotTB documentation built on April 21, 2020, 1:19 p.m.