#' Random generator of Poisson distribution with mininum value
#'
#' @param min minimum allowed for cluster size
#' @param lambda numeric value for expected cluster size for poisson distr
#'
#' @export
rpois_min <- function(min, lambda){
size <- 0
while(size < min) size <- rpois(1, lambda)
return(size)
}
#' Generate cluster size with specified mininum size allowed
#'
#' @param n_cluster numeric value for number of clusters to be generated
#' @param min minimum allowed for cluster size
#' @param lambda numeric value for expected cluster size for poisson distr
#'
#' @export
generate_clust_size <- function(n_cluster,
min,
lambda){
size <- replicate(n_cluster, rpois_min(min, lambda))
return(size)
}
#' Generate one cluster of binormal distributions
#'
#' Generate one cluster containing healthy and diseased observations generated from normal distributions.
#' The correlated binary data is generated using the multivariate probit method.
#' The correlated continuous marker values are generated by including a random effect
#'
#' @param size_clus size of cluster
#' @param corr_mvn numeric value for correlation in multivariate normal distribution
#' @param mean0 numeric vector of mean for class label 0 (healthy) for all strata
#' @param mean1 numeric vector of mean for class label 1 (diseased) for all strata
#' @param sigma0 numeric vector of standard deviation for class label 0 (healthy) for all strata
#' @param sigma1 numeric vector of standard deviation for class label 1 (diseased) for all strata
#' @param p1 numeric value for proportion of diseased observations in the population
#' @tau numeric value for standard deviation of mixed effect of cluster
#'
#' @return Dataframe containing
#' \itemize{
#' \item cluster_size: cluster size
#' \item X: continuous marker values
#' \item D: class labels (0: healthy; 1: diseased)
#' }
#'
#' @export
genenate_clust <- function(size_clus,
corr_mvn,
mean0,
mean1,
sigma0,
sigma1,
p1,
tau){
# Defining covariance matrix
cov <- diag(rep(1, size_clus))
cov[upper.tri(cov)] <- corr_mvn
cov[lower.tri(cov)] <- corr_mvn
# Generating multivariate normal
z <- MASS::mvrnorm(n = 1,
mu = rep(0, size_clus),
Sigma = cov)
# Generating binary data
D <- ifelse(z < qnorm(p1), 1, 0)
clus_effect <- rnorm(n = 1, sd = tau) # random effect of cluster
X0 <- rnorm(size_clus, mean0, sigma0)
X1 <- rnorm(size_clus, mean1, sigma1)
return(data.frame(cluster_size = size_clus,
X = (1-D)*X0 +D*X1 + clus_effect,
D = D))
}
#' Generate a population of clustered observations
#'
#' @param mean0 numeric value of mean for class label 0 (healthy)
#' @param mean1 numeric value of mean for class label 1 (diseased)
#' @param sigma0 numeric value of standard deviation for class label 0 (healthy)
#' @param sigma1 numeric value of standard deviation for class label 1 (diseased)
#' @param N
#' @param cluster_size
#' @param tau
#' @param p1 numeric value for proportion of diseased observations in the population
#'
#' @tau numeric value for standard deviation of mixed effect of cluster
#' @return Dataframe containing
#' \itemize{
#' \item cluster_size: cluster size
#' \item X: continuous marker values
#' \item D: class labels (0: healthy; 1: diseased)
#' }
#'
#' @export
generate_pop_clust <- function(N,
mean0,
mean1,
sigma0,
sigma1,
p1,
cluster_size,
tau) {
# Generating population
pop <- generate_pop(N, mean0, mean1, sigma0, sigma1, p1)
if(length(cluster_size) == 1){ #fixed cluster size
# Number of clusters
N_cluster <- N/cluster_size
# Adding another layer of error
pop_clus <-
pop %>%
mutate(X = X + rnorm(N, sd = tau),
cluster_id = ntile(X, N_cluster))
} else{ # varying cluster size
N_cluster <- length(cluster_size)
pop_clus <-
pop %>%
mutate(X = X + rnorm(N, sd = tau)) %>%
arrange(X) %>%
mutate(cluster_id = rep(1:N_cluster, sample(cluster_size)))
}
return(pop_clus)
}
#' Title
#'
#' @param N
#' @param freq_strata
#' @param mean0_strat
#' @param mean1_strat
#' @param sigma0_strat
#' @param sigma1_strat
#' @param p1
#' @param cluster_size_strat list of vector of cluster sizes for each strata. cluster sizes should be dimension 1 if cluster sizes are fixed.
#' Should have dimension = number of clusters if cluster sizes are varying
#' @param tau
#'
#' @return
#' @export
#'
#' @examples
generate_pop_clust_strat <- function(N,
freq_strata,
mean0_strat,
mean1_strat,
sigma0_strat,
sigma1_strat,
p1,
cluster_size_strat,
tau) {
N_strata <- length(freq_strata)
size_strata <- ceiling(N*freq_strata)
pop <-
purrr::pmap_dfr(list(as.list(size_strata),
as.list(mean0_strat),
as.list(mean1_strat),
as.list(sigma0_strat),
as.list(sigma1_strat),
replicate(N_strata, p1, simplify = FALSE),
cluster_size_strat,
replicate(N_strata, tau, simplify = FALSE)), # List of parameters for each strata
.f = generate_pop_clust,
.id = 'strata') %>% # Generating population for each strata
mutate(strata = as.numeric(strata)) %>%
group_by(strata, cluster_id) %>%
mutate(cluster_size = n()) %>%
ungroup %>%
group_by(strata) %>%
mutate(strata_size = n()) %>%
ungroup
return(pop)
}
#' Generate a Stratified Two-Stage Cluster sampling
#'
#' @param population dataframe containing population
#' @param n_psu numeric value for number of psu to be drawn from each strata
#' @param n_ssu numeric value for number of ssu to be drawn from each psu
#'
#' @export
generate_tscs_strat <- function(population,
n_psu,
n_ssu){
# Number of strata
N_strata <- length(table(population$strata))
# Number of PSU per strata
N_psu <- tapply(population$cluster_id, population$strata, max)
N_psu_df = data.frame(strata = 1:N_strata,
N_cluster = N_psu)
# Sampling psu
sample_psu <-
population %>%
group_by(strata) %>%
summarise(cluster_id = unique(cluster_id)) %>%
slice_sample(n = n_psu) %>%
left_join(population, by = c('strata', 'cluster_id')) # bringing information from selected clusters
# Sampling SSU
final_sample <-
sample_psu %>%
group_by(strata, cluster_id) %>%
slice_sample(n = n_ssu) %>%
ungroup %>%
left_join(N_psu_df, by = 'strata') %>%
mutate(weight = ifelse(N_cluster < n_psu & cluster_size < n_ssu, 1,
ifelse(N_cluster < n_psu & cluster_size >= n_ssu, cluster_size/n_ssu,
ifelse(N_cluster >= n_psu & cluster_size < n_ssu, N_cluster/n_psu, N_cluster*cluster_size/(n_psu*n_ssu)))))
return(final_sample)
}
#' Title
#'
#' @param pop
#' @param N_psu
#' @param sample_size_psu
#' @param sample_size_ssu
#' @param grid
#'
#' @return
#' @export
#'
#' @examples
# Second attempt
simulate_cum_tscs_strat2 <- function(pop,
sample_size_psu,
sample_size_ssu,
grid = NULL,
var_weight){
samples <-
pmap(list(n_psu = as.list(sample_size_psu),
n_ssu = as.list(sample_size_ssu)),
.f = generate_tscs_strat,
population = pop)
roc <-
samples %>%
map(~svydesign(id =~cluster_id, strata =~strata, weights =~weight, data = .x, fpc = ~strata_size, nest = TRUE)) %>%
map_dfr(estimate_survey_roc,
grid = grid,
var_weight = 'weight',
ci = TRUE,
.id = 'scenario')
return(roc)
}
#' Title
#'
#' @param pop
#' @param N_psu
#' @param N_sample
#' @param sample_size_psu
#' @param sample_size_ssu
#' @param grid
#'
#' @return
#' @export
#'
#' @examples
simulate_sample_tscs_strat <- function(pop,
N_sample,
sample_size_psu,
sample_size_ssu,
grid = NULL) {
# Generating and stacking samples
sim <-
replicate(N_sample,
simulate_cum_tscs_strat2(pop,
sample_size_psu,
sample_size_ssu,
grid = grid), simplify = FALSE) %>%
bind_rows(.id = 'sample')
return(sim)
}
#' Simulation function for stratified Two Stage Cluster Sampling
#'
#' Generates stratified populations and draw stratified SRS from each population.
#' Computes survey weighted ROC curve for each sample drawn form each of the genereated populations
#'
#' @param N_pop numeric value for number of stratified population to be generated
#' @param N_psu numeric vector of number os psu's (clusters) per strata
#' @param cluster_size_strat list of cluster sizes per strata
#' @param param_pop data frame containing population parameters (prob, mu0, mu1, sigma0, sigma1)
#' @param prop_disease numeric value for the proportion of diseased in population
#' @param N_sample numeric value for number of samples to be drawn from each population
#' @param sample_size_psu numeric value for number of psu selected per strata
#' @param sample_size_ssu numeric value for number of ssu selected per selected psu
#' @param grid numeric vector of points for roc curve from 0 to 1
#'
#' @return a dataframe containing containing survey weighted point and interval estimates for ROC curve for each
#' sample drawn from each of the generated populations
#'
#' @export
# simulate_tscs_strat <- function(N_pop,
# N,
# param_pop,
# p1,
# cluster_size_strat,
# tau = 1,
# N_sample,
# sample_size_psu,
# sample_size_ssu,
# grid = seq(0, 1, by=0.1)){
#
#
# # Generating N_pop populations
# pop <-
# replicate(N_pop,
# generate_pop_clust_strat(N = N,
# freq_strata = param_pop$prob,
# mean0_strat = param_pop$mu0,
# mean1_strat = param_pop$mu1,
# sigma0_strat = param_pop$sigma0,
# sigma1_strat = param_pop$sigma1,
# p1 = p1,
# cluster_size = cluster_size_strat,
# tau = tau),
# simplify = FALSE)
#
# # Computing finite population ROC
# tpr_pop <-
# pop %>%
# map_dfr(estimate_survey_roc,
# grid = grid,
# .id = 'population') %>%
# select(population, fpr, tpr_pop = tpr)
#
# # For each population, generating N_sample samples and computing ROC
# sim <-
# pop %>%
# map_dfr(simulate_sample_tscs_strat,
# N_sample = N_sample,
# sample_size_psu = sample_size_psu,
# sample_size_ssu = sample_size_ssu,
# grid = grid,
# .id = 'population') %>%
# left_join(tpr_pop, by = c('fpr', 'population'))
#
# return(sim)
#
# }
simulate_tscs_strat <- function(N_pop,
N,
param_pop,
p1,
cluster_size_strat,
tau = 1,
N_sample,
sample_size_psu,
sample_size_ssu,
grid = seq(0, 1, by=0.1)){
# Generating N_pop populations
pop <-
replicate(N_pop,
generate_pop_clust_strat(N = N,
freq_strata = param_pop$prob,
mean0_strat = param_pop$mu0,
mean1_strat = param_pop$mu1,
sigma0_strat = param_pop$sigma0,
sigma1_strat = param_pop$sigma1,
p1 = p1,
cluster_size = cluster_size_strat,
tau = tau),
simplify = FALSE)
# Computing finite population ROC
tpr_pop <-
pop %>%
map(~svydesign(id =~1, strata =~strata, weights =~1, fpr =~strata_size, data = .x)) %>%
map_dfr(estimate_survey_roc,
grid = grid,
.id = 'population') %>%
#select(population, fpr, tpr_pop = tpr)
select(population, grid, tpr_pop = tpr)
# For each population, generating N_sample samples and computing ROC
sim <-
pop %>%
map_dfr(simulate_sample_tscs_strat,
N_sample = N_sample,
sample_size_psu = sample_size_psu,
sample_size_ssu = sample_size_ssu,
grid = grid,
.id = 'population') %>%
left_join(tpr_pop, by = c('grid', 'population'))
return(sim)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.