auxiliary/t-test-simulation.R

#---------------------------------------------------------------
# Generate one simulated dataset
#---------------------------------------------------------------

generate_chisq_samples <- function(nA, nB, mu_A, mu_B) {

  YA <- rchisq(n = nA, df = mu_A)
  YB <- rchisq(n = nB, df = mu_B)

  sample_data <- data.frame(
    group = rep(c("A","B"), times = c(nA, nB)),
    Y = c(YA, YB)
  )
  return(sample_data)
}


sample_data <- generate_chisq_samples(nA = 4, nB = 10, mu_A = 5, mu_B = 7)
sample_data


#---------------------------------------------------------------
# Function for Welch's t-test
# https://en.wikipedia.org/wiki/Welch%27s_t-test
#---------------------------------------------------------------

t_test <- function(sample_data) {

  # calculate raw summary statistics
  Ns <- table(sample_data$group)
  means <- tapply(sample_data$Y, sample_data$group, mean)
  sds <- tapply(sample_data$Y, sample_data$group, sd)

  # t-test
  tstat <- (means[[2]] - means[[1]]) / sqrt(sum(sds^2 / Ns))
  df <- sum(sds^2 / Ns)^2 / sum(sds^4 / (Ns^2 * (Ns - 1)))
  pval <- 2 * pt(q = abs(tstat), df = df, lower.tail = FALSE)

  return(data.frame(tstat, df, pval))
}

t_test(sample_data)


#---------------------------------------------------------------
# Run all the analysis
#---------------------------------------------------------------

run_t_tests <- function(sample_data) {
  t_raw <- t_test(sample_data)
  t_raw$transform <- "raw"

  log_data <- data.frame(
    group = sample_data$group,
    Y = log(sample_data$Y)
  )
  t_log <- t_test(log_data)
  t_log$transform <- "log"

  return(rbind(t_raw, t_log))
}

run_t_tests(sample_data)


#---------------------------------------------------------------
# Repeat stuff with map()
#---------------------------------------------------------------

library(dplyr)
library(purrr)

replications <- map(1:5, ~ {
  dat <- generate_chisq_samples(nA = 4, nB = 10, mu_A = 5, mu_B = 7)
  run_t_tests(dat)
})

replications # output is a list of results


# Stack the results using map_dfr()

replications <- map_dfr(1:5, ~ {
  dat <- generate_chisq_samples(nA = 4, nB = 10, mu_A = 5, mu_B = 7)
  run_t_tests(dat)
})

replications

#---------------------------------------------------------------
# Estimating rejection rates
#---------------------------------------------------------------

# Now with 20 replications

replications <- map_dfr(1:20, ~ {
  generate_chisq_samples(nA = 4, nB = 10, mu_A = 5, mu_B = 7) %>%
    run_t_tests()
})

# Calculate rejection rates from p-values

replications %>%
  group_by(transform) %>%
  summarize(
    rate_05 = mean(pval < .05),
    rate_10 = mean(pval < .10)
  )

#---------------------------------------------------------------
# Function to simulate a single scenario
#---------------------------------------------------------------

run_sim <- function(reps, nA, nB, mu_A, mu_B) {
  replications <-
    map_dfr(1:reps, ~ {
      dat <- generate_chisq_samples(nA = nA, nB = nB, mu_A = mu_A, mu_B = mu_B)
      run_t_tests(dat)
    })

  replications %>%
    group_by(transform) %>%
    summarize(
      rate_05 = mean(pval < .05),
      rate_10 = mean(pval < .10)
    )
}

run_sim(reps = 1000, nA = 4, nB = 7, mu_A = 10, mu_B = 10)

# Reproducible run_sim()

run_sim <- function(reps, nA, nB, mu_A, mu_B, seed = NULL) {

  if (!is.null(seed)) set.seed(seed)

  replications <-
    map_dfr(1:reps, ~ {
      dat <- generate_chisq_samples(nA = nA, nB = nB, mu_A = mu_A, mu_B = mu_B)
      run_t_tests(dat)
    }) %>%
    bind_rows()

  replications %>%
    group_by(transform) %>%
    summarize(
      rate_05 = mean(pval < .05),
      rate_10 = mean(pval < .10)
    )
}

run_sim(50, nA=8, nB=12, mu_A=15, mu_B=15, seed = 20200311)
run_sim(50, nA=8, nB=12, mu_A=15, mu_B=15, seed = 20200311)



#---------------------------------------------------------------
# Simulation experiments
#---------------------------------------------------------------

# The hard way

run_sim(1000, nA=4, nB=8, mu_A=5, mu_B=5)
run_sim(1000, nA=4, nB=8, mu_A=10, mu_B=10)
run_sim(1000, nA=4, nB=8, mu_A=15, mu_B=15)

#---------------------------------------------------------------
# Creating an experimental design
#---------------------------------------------------------------
library(tidyr)

design_factors <- list(
  mu_A = c(5, 10, 15, 20),
  nA = c(4, 8, 16),
  nB = c(4, 8, 16)
)

params <-
  expand_grid(!!!design_factors) %>%
  mutate(
    mu_B = mu_A,
    reps = 1000,
    seed = 20200311 + 1:n()
  )

params # A "menu" of conditions we would like to study


#---------------------------------------------------------------
# Iterate over conditions using pmap()
#---------------------------------------------------------------

sim_results <- pmap(params, .f = run_sim)


# store the results as a new variable with the parameter values

sim_results <- params
sim_results$res <- pmap(params, .f = run_sim)


# Use unnest() to expand the res variable
unnest(sim_results, cols = res)


# A tidy workflow

sim_results <-
  params %>%
  mutate(res = pmap(., .f = run_sim)) %>%
  unnest(cols = res)

sim_results

#---------------------------------------------------------------
# Parallel processing
#---------------------------------------------------------------

parallel::detectCores()

library(future)
library(furrr)

plan(multiprocess) # choose an appropriate plan from future package

sim_results <-
  params %>%
  mutate(res = future_pmap(., .f = run_sim)) %>%
  unnest(cols = res)


# In serial

system.time(
  params %>%
    mutate(
      res = pmap(., .f = run_sim)
    ) %>%
    unnest(cols = res)
)


# In parallel

system.time(
  params %>%
    mutate(
      res = future_pmap(., .f = run_sim)
    ) %>%
    unnest(cols = res)
)


#---------------------------------------------------------------
# Visualization
#---------------------------------------------------------------

library(ggplot2)

ggplot(sim_results,
       aes(x = mu_A, y = rate_05,
           shape = transform, color = transform)) +
  geom_point() +
  geom_line() +
  facet_grid(nA ~ nB, labeller = "label_both") +
  expand_limits(y = 0) +
  geom_hline(yintercept = .05) +
  theme_light()

ggplot(sim_results,
       aes(x = mu_A, y = rate_10,
           shape = transform, color = transform)) +
  geom_point() +
  geom_line() +
  facet_grid(nA ~ nB, labeller = "label_both") +
  expand_limits(y = 0) +
  geom_hline(yintercept = .10) +
  theme_light()
meghapsimatrix/SimHelpers documentation built on Jan. 14, 2025, 5:16 a.m.