## SKG
## Latest tree sampler
## Includes outside capabilities
## April 2, 2020
#' Sample trees
#'
#' @param data data frame with the following columns
#' \describe{
#' \item{n}{number of \emph{observed} individuals in cluster}
#' \item{n_pos}{number of \emph{observed} positive smears in cluster}
#' \item{freq}{numbr of times (n,n_pos) was observed}
#' }
#' @param B number of trees to sample for each (n,n_pos) pair
#' @param impute_generator logical indicating whether we need to
#' impute an outside generator with its own smear status. Default is \code{FALSE}.
#' @param prob_pos probability of a positive smear individual
#' @return data frame with the following columns
#' \describe{
#' \item{obs_n}{total \emph{observed} size}
#' \item{obs_n_pos}{number of \emph{observed} smear pos.}
#' \item{n}{number of actual individuals in cluster (with generator)}
#' \item{n_pos}{number of actual individuals in cluster (with generator)}
#' \item{i_pos}{number of i-positive transmissions (with generator)}
#' \item{i_neg}{number of i-negative transmissions}
#' \item{x0}{smear status of generator}
#' \item{n0}{number infected by generator}
#' \item{freq}{frequency of (n,n_pos,i_pos, i_neg, x0,n0)}
#' }
#' @export
#' @details We uniformly generate trees corresponding to a given (size, # positive) pair.
#' If we need to impute a generator, then we actually generate trees of size (obs_size + 1, new_pos)
#' where the smear status of the the generator is
tree_sampler <- function(data, B,
impute_generator = FALSE,
prob_pos = .5,
summarize_generator = FALSE){
## If we need to impute a generator, we do so here
if(impute_generator){
data <- impute_root(data, B = B, prob_pos = prob_pos)
} else{ ## Else we rename the variables, that is obs_n = n
data$B <- B
data <- data %>%
dplyr::mutate(obs_n = .data$n,
obs_n_pos = .data$n_pos)
}
sampled_trees_summary <- data %>%
dplyr::group_by(.data$obs_n, .data$obs_n_pos,
.data$n, .data$n_pos) %>%
tidyr::nest() %>%
dplyr::mutate( sum = purrr::map(.data$data,
tree_sampler_nx,
n = .data$n,
x = .data$n_pos,
summarize_generator =
summarize_generator)) %>%
dplyr::select(-.data$data) %>%
tidyr::unnest(cols = sum) %>%
dplyr::select(-.data$size, -.data$x)
return(sampled_trees_summary)
}
#' Sample uniform trees
#'
#' @param n total size
#' @param x total number of smear pos
#' @param summarize_generator Do we have summarize the generator infections? Default is FALSE
#' @return data frame with the following columns
#' \describe{
#' \item{size}{total size}
#' \item{x}{number of smear pos}
#' \item{i_pos}{number of i-positive transmissions}
#' \item{i_neg}{number of i-negative transmissions}
#' \item{freq}{frequency of (n,x,i)}
#' }
#' @export
#' @details assumes one single generator
tree_sampler_nx <- function(data, n, x,
summarize_generator = FALSE){
## If size 1, have trivial samples
if(n == 1){
df <- data.frame(size = n,
x = x,
i_pos = 0,
i_neg = 0,
freq = data$B,
stringsAsFactors = FALSE)
return(df)
}
B <- data$B[1]
## Sample generation sizes
# only have 1 generation if there is only one person, otherwise there are at least 2
g_vec <- 2 + rbinom(n = B, size = n-2, prob = .5) # number of generations where first generation is fixed
g_tab <- table(g_vec)
unique_g <- sort(unique(g_vec))
i_list <- vector(mode = "list", length = length(unique_g))
for(ii in 1:length(unique_g)){
## Sample unique permutations of generation sizes given g
g <- unique_g[ii]
perm_mat <- sample_unique_perms(g = g, n = n,
B = as.numeric(g_tab[ii]))
i_list[[ii]] <- sample_in0x0(perm_mat, x,
summarize_generator) # summarize tree with i_pos, i_neg, (n0, x0)
##
}
i_mat <- do.call('rbind', i_list)
if(summarize_generator){
df <- data.frame(table(i_mat[,1], i_mat[,2], i_mat[,3], i_mat[,4]))
colnames(df) <- c("i_pos", "i_neg", "n0", "x0", "freq")
df$size <- n
df$x <- x
df$i_pos <- as.numeric(as.character(df$i_pos))
df$i_neg <- as.numeric(as.character(df$i_neg))
df$n0 <- as.numeric(as.character(df$n0))
df$x0 <- as.numeric(as.character(df$x0))
df <- df %>%
dplyr::filter(.data$freq > 0) %>%
dplyr::select(.data$size, .data$x,
.data$i_pos, .data$i_neg,
.data$n0, .data$x0,
.data$freq)
} else {
df <- data.frame(table(i_mat[,1], i_mat[,2]))
colnames(df) <- c("i_pos", "i_neg", "freq")
df$size <- n
df$x <- x
df$i_pos <- as.numeric(as.character(df$i_pos))
df$i_neg <- as.numeric(as.character(df$i_neg))
df <- df %>%
dplyr::filter(.data$freq > 0) %>%
dplyr::select(.data$size, .data$x,
.data$i_pos, .data$i_neg,
.data$freq)
}
return(df)
}
## Tomorrow's plan
## test the tree sampler
## Can pull a lot of tests from sample_trees fast
## test outer function
## Can I replicate one root results?
## Can I get multiple root results?
## do I need x0/n0 or can I get away with not knowing? I think I don't have to know
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.