R/tree-sampler-with-outside-gen.R

Defines functions tree_sampler_nx tree_sampler

## SKG
## Latest tree sampler
## Includes outside capabilities
## April 2, 2020

#' Sample trees
#'
#' @param data data frame with the following columns
#' \describe{
#' \item{n}{number of \emph{observed} individuals in cluster}
#' \item{n_pos}{number of \emph{observed} positive smears in cluster}
#' \item{freq}{numbr of times (n,n_pos) was observed}
#' }
#' @param B number of trees to sample for each (n,n_pos) pair
#' @param impute_generator logical indicating whether we need to 
#' impute an outside generator with its own smear status.  Default is \code{FALSE}.
#' @param prob_pos probability of a positive smear individual
#' @return data frame with the following columns
#' \describe{
#' \item{obs_n}{total \emph{observed} size}
#' \item{obs_n_pos}{number of \emph{observed} smear pos.}
#' \item{n}{number of actual individuals in cluster (with generator)}
#' \item{n_pos}{number of actual individuals in cluster (with generator)}
#' \item{i_pos}{number of i-positive transmissions (with generator)}
#' \item{i_neg}{number of i-negative transmissions}
#' \item{x0}{smear status of generator}
#' \item{n0}{number infected by generator}
#' \item{freq}{frequency of (n,n_pos,i_pos, i_neg, x0,n0)}
#' }
#' @export
#' @details We uniformly generate trees corresponding to a given (size, # positive) pair.  
#' If we need to impute a generator, then we actually generate trees of size (obs_size + 1, new_pos) 
#' where the smear status of the the generator is 
tree_sampler <- function(data, B,
                         impute_generator = FALSE,
                         prob_pos = .5,
                         summarize_generator = FALSE){
    
    ## If we need to impute a generator, we do so here
    if(impute_generator){
        data <- impute_root(data, B = B, prob_pos = prob_pos)
    } else{  ## Else we rename the variables, that is obs_n = n
        data$B <- B
        data <- data %>%
            dplyr::mutate(obs_n = .data$n,
                          obs_n_pos = .data$n_pos)
    }
    
    sampled_trees_summary <- data %>%
        dplyr::group_by(.data$obs_n, .data$obs_n_pos,
                        .data$n, .data$n_pos) %>%
        tidyr::nest() %>%
        dplyr::mutate( sum = purrr::map(.data$data,
                                        tree_sampler_nx,
                                        n = .data$n,
                                        x = .data$n_pos,
                                        summarize_generator =
                                            summarize_generator)) %>%
        dplyr::select(-.data$data) %>%
        tidyr::unnest(cols = sum) %>%
        dplyr::select(-.data$size, -.data$x)
    
    return(sampled_trees_summary)
    
    
}

#' Sample uniform trees
#'
#' @param n total size
#' @param x total number of smear pos
#' @param summarize_generator Do we have summarize the generator infections?  Default is FALSE
#' @return data frame with the following columns
#' \describe{
#' \item{size}{total size}
#' \item{x}{number of smear pos}
#' \item{i_pos}{number of i-positive transmissions}
#' \item{i_neg}{number of i-negative transmissions}
#' \item{freq}{frequency of (n,x,i)}
#' }
#' @export
#' @details assumes one single generator
tree_sampler_nx <- function(data, n, x,
                            summarize_generator = FALSE){
  
  ## If size 1, have trivial samples
  if(n == 1){
    df <- data.frame(size = n,
                     x = x,
                     i_pos = 0,
                     i_neg = 0,
                     freq = data$B,
                     stringsAsFactors = FALSE)
    return(df)
  }
  B <- data$B[1]
  ## Sample generation sizes
  # only have 1 generation if there is only one person, otherwise there are at least 2
  g_vec <- 2 + rbinom(n = B, size = n-2, prob = .5) # number of generations where first generation is fixed
  
  g_tab <- table(g_vec)
  unique_g <- sort(unique(g_vec))
  i_list <- vector(mode = "list", length = length(unique_g))
  
  for(ii in 1:length(unique_g)){
    ## Sample unique permutations of generation sizes given g
    g <- unique_g[ii]
    perm_mat <- sample_unique_perms(g = g, n = n,
                                    B = as.numeric(g_tab[ii]))
      i_list[[ii]] <- sample_in0x0(perm_mat, x,
                                   summarize_generator)  # summarize tree with i_pos, i_neg, (n0, x0)
    ## 
  }
  i_mat <- do.call('rbind', i_list)
  if(summarize_generator){
      df <- data.frame(table(i_mat[,1], i_mat[,2], i_mat[,3], i_mat[,4]))
      colnames(df) <- c("i_pos", "i_neg", "n0", "x0", "freq")
      df$size <- n
      df$x <- x
      df$i_pos <- as.numeric(as.character(df$i_pos))
      df$i_neg <- as.numeric(as.character(df$i_neg))
      df$n0 <- as.numeric(as.character(df$n0))
      df$x0 <- as.numeric(as.character(df$x0))
      df <- df %>%
          dplyr::filter(.data$freq > 0) %>%
          dplyr::select(.data$size, .data$x, 
                        .data$i_pos, .data$i_neg, 
                        .data$n0, .data$x0,
                        .data$freq)
  } else {
      df <- data.frame(table(i_mat[,1], i_mat[,2]))
      colnames(df) <- c("i_pos", "i_neg",  "freq")
      df$size <- n
      df$x <- x
      df$i_pos <- as.numeric(as.character(df$i_pos))
      df$i_neg <- as.numeric(as.character(df$i_neg))
      df <- df %>%
          dplyr::filter(.data$freq > 0) %>%
          dplyr::select(.data$size, .data$x, 
                        .data$i_pos, .data$i_neg,
                        .data$freq)
  }

  
  return(df)
  
  
}

## Tomorrow's plan
## test the tree sampler
## Can pull a lot of tests from sample_trees fast
## test outer function
## Can I replicate one root results?
## Can I get multiple root results?
## do I need x0/n0 or can I get away with not knowing?  I think I don't have to know
skgallagher/TBornotTB documentation built on April 21, 2020, 1:19 p.m.