## Generation | covariates
##
#' Wrapper for simulate_cond_bp
#'
#' @param K number of clusters to generate
#' @param n_vec number of people in each cluster
#' @param n_pos_vec number of positive smears in each cluster
#' @return data frame with the following columns
#' @param part_list a list of unique partitions with
#' @param one_init logical. If TRUE, we assume the tree grew from one person
#' @param g_weight_list a summary of part_list, gives the number of times generations of size g appear in a cluster of size n
#' \describe{
#' \item{cluster_id}
#' \item{person_id}{order of infection in the cluster}
#' \item{smear}{-1 (negative) / + 1 (positive)}
#' \item{gen}{generation number (>=0)}
#' \item{inf_id}{ID of the infector}
#' \item{n_inf}{number of people infected by person}
#' \item{cluster_pos}{number of positive smears in the cluster}
#' }
#' @export
simulate_many_cond_bp <- function(K, n_vec, n_pos_vec,
part_list = NULL,
g_weight_list = NULL,
one_init = FALSE){
if(any(n_pos_vec > n_vec)){
stop("You have more positive smears than people in the cluster!")
}
ll <- vector(mode = "list", length = K)
for(ii in 1:K){
ll[[ii]] <- simulate_cond_bp(k = ii, n = n_vec[ii],
n_pos = n_pos_vec[ii],
part_list = part_list,
one_init = one_init,
g_weight_list = g_weight_list)
}
df <- dplyr::bind_rows(ll)
return(df)
}
#' Simulate a single branching process conditioned on the given data
#'
#' @param k cluster ID
#' @param n number of people in the cluster
#' @param n_pos number of positive smears in the cluster
#' @param part_list a list of unique partitions with
#' @param one_init logical. If TRUE, we assume the tree grew from one person
#' @param g_weight_list a summary of part_list, gives the number of times generations of size g appear in a cluster of size n
#' @return data frame with the following columns
#' \describe{
#' \item{cluster_id}
#' \item{person_id}{order of infection in the cluster}
#' \item{smear}{-1 (negative) / + 1 (positive)}
#' \item{gen}{generation number (>=0)}
#' \item{inf_id}{ID of the infector}
#' \item{n_inf}{number of people infected by person}
#' \item{cluster_pos}{number of positive smears in the cluster}
#' }
#' @export
simulate_cond_bp <- function(k, n, n_pos,
part_list = NULL,
one_init = FALSE,
g_weight_list = NULL){
if(is.null(g_weight_list) |
is.null(part_list)) stop("generate a valid weight list and part_list")
part <- draw_part_from_list(n, part_list,
g_weight_list, one_init)
cluster <- part_to_cluster(k, part)
smear <- assign_smear(n, n_pos)
# if(length(smear) != nrow(cluster)) browser()
cluster$smear <- smear
cluster$cluster_pos <- n_pos
return(cluster)
}
#' Draw partition from pre-computed list
#'
#' @param n number of total people in cluster
#' @param part_list pre computed list of partitions (see details)
#' @param g_weight_list a summary of part_list, gives the number of times generations of size g appear in a cluster of size n
#' @return a matrix of dimension g x 1
#' @details \code{part_list} is a list of lists. The first list is named by the total number in the cluster, "n{x}" where x is the number in that cluster. The second list is "g{y}" where y is the number of generations in that partition. Each entry in the second list is a matrix with number rows of size y. Each column is a unique partition and so all columns will sum to n.
draw_part_from_list <- function(n, part_list = NULL,
g_weight_list = NULL,
one_init = FALSE){
if(n == 1) return(matrix(1, ncol = 1))
if(one_init) n <- n-1 # if one root, we will add at end
sub_part_list <- part_list[[paste0("n", n)]]
wts <- g_weight_list[[n]]
n_gen <- sample(1:n, prob = wts / sum(wts), size = 1)
sub_parts <- sub_part_list[[paste0("g", n_gen)]]
sampled_ind <- sample(1:ncol(sub_parts), size = 1)
sampled_part <- sub_parts[, sampled_ind]
## if forcing one person init, add to beginning
if(one_init) sampled_part <- c(1, sampled_part)
sampled_part <- matrix(sampled_part,
ncol = 1)
return(sampled_part)
}
#' Draw the number of generations for a fixed cluster size
#'
#' @param n the cluster size
#' @param lambda the mean number in a Poisson distribution to draw from
#' @return a number between 1 and n, which is the number of generations
draw_n_gen <- function(n, lambda = 3){
n_gen <- rpois(1, lambda) + 1
if(n_gen > n) n_gen <- n
return(n_gen)
}
#' Draw the number in each generation g
#'
#' @param n the number in the cluster
#' @param g the number of (non-zero) generations
#' @return a matrix of dimension g x 1
draw_part <- function(n, g){
parts <- partitions::restrictedparts(n = n, m = g,
include.zero = FALSE)
if(ncol(parts) == 1){
sampled_part <- matrix(parts[,1], ncol = 1)
} else {
unique_perm_parts <- matrix(unlist(apply(parts, 2, unique_perm)),
nrow = g)
sampled_part_ind <- sample(1:ncol(unique_perm_parts), size = 1)
sampled_part <- unique_perm_parts[, sampled_part_ind, drop = FALSE]
}
return(sampled_part)
}
#' Take the partitions by generations and
#' turn it into a full transmission path
#'
#' @param k cluster ID
#' @param part matrix of dimension g x 1, the number in each generation
#' @return data frame with the following columns
#' \describe{
#' \item{cluster_id}{cluster_id}
#' \item{gen}{generation number (1 to n)}
#' \item{gen_n}{number in generation}
#' \item{id}{C{x}-G{y}-N{z}, cluster x, generation y, number z makes up the unique ID}
#' \item{cluster_size}{number in cluster k (n)}
#' \item{infector_id}{who infected this person}
#' \item{n_inf}{number of infections caused by this person}
#' }
part_to_cluster <- function(k, part){
n <- sum(part)
g <- nrow(part)
cluster <- data.frame(cluster_id = rep(k, n),
gen = 1, gen_n = 1,
cluster_size = n,
infector_id = NA)
cluster$gen <- rep(1:g, times = part)
cluster <- cluster %>% dplyr::group_by(.data$gen) %>% dplyr::mutate(gen_n = dplyr::row_number())
cluster$id <- paste0("C", k, "-G", cluster$gen, "-N", cluster$gen_n)
cluster$infector_id <- assign_infector(cluster$gen, part, k)
cluster$n_inf <- count_infections(cluster)
return(cluster)
}
#' Assign an infector ID to someone in previous generation
#'
#' @param gen_vec vector of generation numbers
#' @param part, matrix of partitions g x 1
#' @param k cluster ID
#' @return a vector of length n, which gives the id of the person who infected this one or NA if person is in first generation
assign_infector <- function(gen_vec, part, k){
infector_id_temp <- sapply(gen_vec, function(g){
if(g == 1) return(NA)
n_in_gen <- part[g-1, 1]
sample(x = 1:n_in_gen, size = 1)
})
infector_id <- ifelse(is.na(infector_id_temp), NA,
paste0("C", k, "-G", gen_vec - 1, "-N", infector_id_temp))
return(infector_id)
}
#' Count the number of infections each person gives
#'
#' @param df data frame with the following columns
#' \describe{
#' \item{id}{c{x}-g{y}-n{z}, cluster x, generation y, number z makes up the unique ID}
#' \item{infector_id}{c{x}-g{y}-n{z}, cluster x, generation y, number z makes up the unique ID}
#' }
#' @return counts corresponding to each id.
#' @details the count is the number of people the person with the ID infected, i.e. the number of times their id appears in infector_id column (can be zero)
count_infections <- function(df){
fac_var <- factor(df$infector_id, levels = df$id)
counts <- table(fac_var)
counts <- as.numeric(counts)
names(counts) <- NULL
return(counts)
}
#' Assign a fixed number of smears to the group
#'
#' @param n number of people in group
#' @param n_pos number of positive smears
#' @return vector of size n of exactly n_pos 1s and n - n_pos -1
assign_smear <- function(n, n_pos){
if(n == n_pos){
smear <- rep(1, n)
} else if(n_pos == 0){
smear <- rep(-1, n)
} else {
smear <- c(rep(1, n_pos), rep(-1, n - n_pos))
}
return(sample(smear))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.