R/tsscs.R

Defines functions simulate_tscs_strat simulate_sample_tscs_strat simulate_cum_tscs_strat2 generate_tscs_strat generate_pop_clust_strat generate_pop_clust genenate_clust generate_clust_size rpois_min

Documented in genenate_clust generate_clust_size generate_pop_clust generate_pop_clust_strat generate_tscs_strat rpois_min simulate_cum_tscs_strat2 simulate_sample_tscs_strat simulate_tscs_strat

#' Random generator of Poisson distribution with mininum value
#'
#' @param min minimum allowed for cluster size
#' @param lambda numeric value for expected cluster size for poisson distr
#'
#' @export

rpois_min <- function(min, lambda){

  size <- 0
  while(size < min) size <- rpois(1, lambda)
  return(size)

}


#' Generate cluster size with specified mininum size allowed
#'
#' @param n_cluster numeric value for number of clusters to be generated
#' @param min minimum allowed for cluster size
#' @param lambda numeric value for expected cluster size for poisson distr
#'
#' @export

generate_clust_size <- function(n_cluster,
                                min,
                                lambda){

  size <- replicate(n_cluster, rpois_min(min, lambda))
  return(size)

}


#' Generate one cluster of binormal distributions
#'
#' Generate one cluster containing healthy and diseased observations generated from normal distributions.
#' The correlated binary data is generated using the multivariate probit method.
#' The correlated continuous marker values are generated by including a random effect
#'
#' @param size_clus size of cluster
#' @param corr_mvn numeric value for correlation in multivariate normal distribution
#' @param mean0 numeric vector of mean for class label 0 (healthy) for all strata
#' @param mean1 numeric vector of mean for class label 1 (diseased) for all strata
#' @param sigma0 numeric vector of standard deviation for class label 0 (healthy) for all strata
#' @param sigma1 numeric vector of standard deviation for class label 1 (diseased) for all strata
#' @param p1 numeric value for proportion of diseased observations in the population
#' @tau numeric value for standard deviation of mixed effect of cluster
#'
#' @return Dataframe containing
#' \itemize{
#'   \item cluster_size: cluster size
#'   \item X: continuous marker values
#'   \item D: class labels (0: healthy; 1: diseased)
#' }
#'
#' @export

genenate_clust <- function(size_clus,
                           corr_mvn,
                           mean0,
                           mean1,
                           sigma0,
                           sigma1,
                           p1,
                           tau){

  # Defining covariance matrix
  cov <- diag(rep(1, size_clus))
  cov[upper.tri(cov)] <- corr_mvn
  cov[lower.tri(cov)] <- corr_mvn

  # Generating multivariate normal
  z <- MASS::mvrnorm(n = 1,
                     mu = rep(0, size_clus),
                     Sigma = cov)

  # Generating binary data
  D <- ifelse(z < qnorm(p1), 1, 0)
  clus_effect <- rnorm(n = 1, sd = tau) # random effect of cluster
  X0 <- rnorm(size_clus, mean0, sigma0)
  X1 <- rnorm(size_clus, mean1, sigma1)

  return(data.frame(cluster_size = size_clus,
                    X = (1-D)*X0 +D*X1 + clus_effect,
                    D = D))
}

#' Generate a population of clustered observations
#'
#' @param mean0 numeric value of mean for class label 0 (healthy)
#' @param mean1 numeric value of mean for class label 1 (diseased)
#' @param sigma0 numeric value of standard deviation for class label 0 (healthy)
#' @param sigma1 numeric value of standard deviation for class label 1 (diseased)
#' @param N
#' @param cluster_size
#' @param tau
#' @param p1 numeric value for proportion of diseased observations in the population
#'
#' @tau numeric value for standard deviation of mixed effect of cluster
#' @return Dataframe containing
#' \itemize{
#'   \item cluster_size: cluster size
#'   \item X: continuous marker values
#'   \item D: class labels (0: healthy; 1: diseased)
#' }
#'
#' @export

generate_pop_clust <- function(N,
                               mean0,
                               mean1,
                               sigma0,
                               sigma1,
                               p1,
                               cluster_size,
                               tau) {

  # Generating population
  pop <- generate_pop(N, mean0, mean1, sigma0, sigma1, p1)

  if(length(cluster_size) == 1){ #fixed cluster size

    # Number of clusters
    N_cluster <- N/cluster_size

    # Adding another layer of error
    pop_clus <-
      pop %>%
      mutate(X = X + rnorm(N, sd = tau),
             cluster_id = ntile(X, N_cluster))

  } else{ # varying cluster size

    N_cluster <- length(cluster_size)

    pop_clus <-
      pop %>%
      mutate(X = X + rnorm(N, sd = tau)) %>%
      arrange(X) %>%
      mutate(cluster_id = rep(1:N_cluster, sample(cluster_size)))

  }

  return(pop_clus)
}


#' Title
#'
#' @param N
#' @param freq_strata
#' @param mean0_strat
#' @param mean1_strat
#' @param sigma0_strat
#' @param sigma1_strat
#' @param p1
#' @param cluster_size_strat list of vector of cluster sizes for each strata. cluster sizes should be dimension 1 if cluster sizes are fixed.
#' Should have dimension = number of clusters if cluster sizes are varying
#' @param tau
#'
#' @return
#' @export
#'
#' @examples

generate_pop_clust_strat <- function(N,
                                     freq_strata,
                                     mean0_strat,
                                     mean1_strat,
                                     sigma0_strat,
                                     sigma1_strat,
                                     p1,
                                     cluster_size_strat,
                                     tau) {

  N_strata <- length(freq_strata)
  size_strata <- ceiling(N*freq_strata)

  pop <-
    purrr::pmap_dfr(list(as.list(size_strata),
                     as.list(mean0_strat),
                     as.list(mean1_strat),
                     as.list(sigma0_strat),
                     as.list(sigma1_strat),
                     replicate(N_strata, p1, simplify = FALSE),
                     cluster_size_strat,
                     replicate(N_strata, tau, simplify = FALSE)), # List of parameters for each strata
                .f = generate_pop_clust,
                .id = 'strata') %>% # Generating population for each strata
    mutate(strata = as.numeric(strata)) %>%
    group_by(strata, cluster_id) %>%
    mutate(cluster_size = n()) %>%
    ungroup %>%
    group_by(strata) %>%
    mutate(strata_size = n()) %>%
    ungroup


  return(pop)

}

#' Generate a Stratified Two-Stage Cluster sampling
#'
#' @param population dataframe containing population
#' @param n_psu numeric value for number of psu to be drawn from each strata
#' @param n_ssu numeric value for number of ssu to be drawn from each psu
#'
#' @export

generate_tscs_strat <- function(population,
                                n_psu,
                                n_ssu){
  # Number of strata
  N_strata <- length(table(population$strata))

  # Number of PSU per strata
  N_psu <- tapply(population$cluster_id, population$strata, max)

  N_psu_df = data.frame(strata = 1:N_strata,
                        N_cluster = N_psu)

  # Sampling psu
  sample_psu <-
    population %>%
    group_by(strata) %>%
    summarise(cluster_id = unique(cluster_id)) %>%
    slice_sample(n = n_psu) %>%
    left_join(population, by = c('strata', 'cluster_id')) # bringing information from selected clusters

  # Sampling SSU
  final_sample <-
    sample_psu %>%
    group_by(strata, cluster_id) %>%
    slice_sample(n = n_ssu) %>%
    ungroup %>%
    left_join(N_psu_df, by = 'strata') %>%
    mutate(weight = ifelse(N_cluster < n_psu & cluster_size < n_ssu, 1,
                           ifelse(N_cluster < n_psu & cluster_size >= n_ssu, cluster_size/n_ssu,
                                  ifelse(N_cluster >= n_psu & cluster_size < n_ssu, N_cluster/n_psu, N_cluster*cluster_size/(n_psu*n_ssu)))))


  return(final_sample)


}


#' Title
#'
#' @param pop
#' @param N_psu
#' @param sample_size_psu
#' @param sample_size_ssu
#' @param grid
#'
#' @return
#' @export
#'
#' @examples

# Second attempt
simulate_cum_tscs_strat2 <- function(pop,
                                     sample_size_psu,
                                     sample_size_ssu,
                                     grid = NULL,
                                     var_weight){

  samples <-
    pmap(list(n_psu = as.list(sample_size_psu),
              n_ssu = as.list(sample_size_ssu)),
         .f = generate_tscs_strat,
         population = pop)

  roc <-
    samples %>%
    map(~svydesign(id =~cluster_id, strata =~strata, weights =~weight, data = .x, fpc = ~strata_size, nest = TRUE)) %>%
    map_dfr(estimate_survey_roc,
            grid = grid,
            var_weight = 'weight',
            ci = TRUE,
            .id = 'scenario')


  return(roc)


}


#' Title
#'
#' @param pop
#' @param N_psu
#' @param N_sample
#' @param sample_size_psu
#' @param sample_size_ssu
#' @param grid
#'
#' @return
#' @export
#'
#' @examples

simulate_sample_tscs_strat <- function(pop,
                                       N_sample,
                                       sample_size_psu,
                                       sample_size_ssu,
                                       grid = NULL) {


  # Generating and stacking samples
  sim <-
    replicate(N_sample,
              simulate_cum_tscs_strat2(pop,
                                       sample_size_psu,
                                       sample_size_ssu,
                                       grid = grid), simplify = FALSE) %>%
    bind_rows(.id = 'sample')



  return(sim)


}

#' Simulation function for stratified Two Stage Cluster Sampling
#'
#' Generates stratified populations and draw stratified SRS from each population.
#' Computes survey weighted ROC curve for each sample drawn form each of the genereated populations
#'
#' @param N_pop numeric value for number of stratified population to be generated
#' @param N_psu numeric vector of number os psu's (clusters) per strata
#' @param cluster_size_strat list of cluster sizes per strata
#' @param param_pop data frame containing population parameters (prob, mu0, mu1, sigma0, sigma1)
#' @param prop_disease numeric value for the proportion of diseased in population
#' @param N_sample numeric value for number of samples to be drawn from each population
#' @param sample_size_psu numeric value for number of psu selected per strata
#' @param sample_size_ssu numeric value for number of ssu selected per selected psu
#' @param grid numeric vector of points for roc curve from 0 to 1
#'
#' @return a dataframe containing containing survey weighted point and interval estimates for ROC curve for each
#' sample drawn from each of the generated populations
#'
#' @export

# simulate_tscs_strat <- function(N_pop,
#                                 N,
#                                 param_pop,
#                                 p1,
#                                 cluster_size_strat,
#                                 tau = 1,
#                                 N_sample,
#                                 sample_size_psu,
#                                 sample_size_ssu,
#                                 grid = seq(0, 1, by=0.1)){
#
#
#   # Generating N_pop populations
#   pop <-
#     replicate(N_pop,
#               generate_pop_clust_strat(N = N,
#                                        freq_strata = param_pop$prob,
#                                        mean0_strat = param_pop$mu0,
#                                        mean1_strat = param_pop$mu1,
#                                        sigma0_strat = param_pop$sigma0,
#                                        sigma1_strat = param_pop$sigma1,
#                                        p1 = p1,
#                                        cluster_size = cluster_size_strat,
#                                        tau = tau),
#               simplify = FALSE)
#
#   # Computing finite population ROC
#   tpr_pop <-
#     pop %>%
#     map_dfr(estimate_survey_roc,
#             grid = grid,
#             .id = 'population') %>%
#     select(population, fpr, tpr_pop = tpr)
#
#   # For each population, generating N_sample samples and computing ROC
#   sim <-
#     pop %>%
#     map_dfr(simulate_sample_tscs_strat,
#             N_sample = N_sample,
#             sample_size_psu = sample_size_psu,
#             sample_size_ssu = sample_size_ssu,
#             grid = grid,
#             .id = 'population') %>%
#     left_join(tpr_pop, by = c('fpr', 'population'))
#
#   return(sim)
#
# }

simulate_tscs_strat <- function(N_pop,
                                N,
                                param_pop,
                                p1,
                                cluster_size_strat,
                                tau = 1,
                                N_sample,
                                sample_size_psu,
                                sample_size_ssu,
                                grid = seq(0, 1, by=0.1)){


  # Generating N_pop populations
  pop <-
    replicate(N_pop,
              generate_pop_clust_strat(N = N,
                                       freq_strata = param_pop$prob,
                                       mean0_strat = param_pop$mu0,
                                       mean1_strat = param_pop$mu1,
                                       sigma0_strat = param_pop$sigma0,
                                       sigma1_strat = param_pop$sigma1,
                                       p1 = p1,
                                       cluster_size = cluster_size_strat,
                                       tau = tau),
              simplify = FALSE)

  # Computing finite population ROC
  tpr_pop <-
    pop %>%
    map(~svydesign(id =~1, strata =~strata, weights =~1, fpr =~strata_size, data = .x)) %>%
    map_dfr(estimate_survey_roc,
            grid = grid,
            .id = 'population') %>%
    #select(population, fpr, tpr_pop = tpr)
    select(population, grid, tpr_pop = tpr)

  # For each population, generating N_sample samples and computing ROC
  sim <-
    pop %>%
    map_dfr(simulate_sample_tscs_strat,
            N_sample = N_sample,
            sample_size_psu = sample_size_psu,
            sample_size_ssu = sample_size_ssu,
            grid = grid,
            .id = 'population') %>%
    left_join(tpr_pop, by = c('grid', 'population'))

  return(sim)

}
tamytsujimoto/surveyROC documentation built on Sept. 8, 2021, 11:35 p.m.