In lebebr01/simglm: Simulate Models Based on the Generalized Linear Model

library(knitr)
library(simglm)
knit_print.data.frame = function(x, ...) {
  res = paste(c('', '', kable(x, output = FALSE)), collapse = '\n')
  asis_output(res)
}

Deprecated

This functionality has been deprecated and is no longer supported. See tidy_simulation vignette.

Power Analysis with simglm

The simglm package allows the ability to conduct a power analysis through simulation. This will be particularly helpful with multilevel models and generalized linear models. To show the process, we will start with basic regression models.

Single Level Power Analysis

Let's look at a simple single level regression example to get started:

fixed <- ~ 1 + act + diff + numCourse + act:numCourse
fixed_param <- c(0.5, 1.1, 0.6, 0.9, 1.1)
cov_param <- list(dist_fun = c('rnorm', 'rnorm', 'rnorm'),
                  var_type = c("single", "single", "single"),
                  opts = list(list(mean = 0, sd = 2),
                              list(mean = 0, sd = 2),
                              list(mean = 0, sd = 1)))
n <- 150
error_var <- 20
with_err_gen <- 'rnorm'
pow_param <- c('(Intercept)', 'act', 'diff', 'numCourse')
alpha <- .01
pow_dist <- "t"
pow_tail <- 2
replicates <- 100
power_out <- sim_pow(fixed = fixed, fixed_param = fixed_param, 
                     cov_param = cov_param, n = n, error_var = error_var,
                     with_err_gen = with_err_gen, data_str = "single",
                     pow_param = pow_param, alpha = alpha,
                     pow_dist = pow_dist, pow_tail = pow_tail, 
                     replicates = replicates)

Much of the output here is the same from the sim_reg function. The additional arguments, pow_param represents the terms to conduct a power analysis for and must be a subset of the fixed argument, alpha represents the per term level of significance, pow_dist represents the sampling distribution to refer to, either 'z' or 't', pow_tail represents whether a one or two tailed hypothesis is being tested, and replicates represents the number of simulations to conduct. Note, to do a power analysis for the intercept, '(Intercept)' must be used. By default, if pow_param is not specified power is conducted for all terms.

Finally, looking at the output from the above call:

head(power_out)

The output contains the variable name, the average test statistic, the standard deviation of the test statistic, the power rate, the number of null hypotheses rejects, and the total number of replications. Increasing the number of replications would increase the precision of the power analysis, however may significantly increase the computational time.

Standardized Coefficients

By default, the simglm package uses unstandardized regression coefficients when doing the simulation. A way to use standardized coefficients however, would be to generate standardized variables. For example:

fixed <- ~ 1 + act + diff + numCourse + act:numCourse
fixed_param <- c(0.2, 0.4, 0.25, 0.7, 0.1)
cov_param <- list(dist_fun = c('rnorm', 'rnorm', 'rnorm'),
                  var_type = c("single", "single", "single"),
                  opts = list(list(mean = 0, sd = 1),
                              list(mean = 0, sd = 1),
                              list(mean = 0, sd = 1)))
n <- 150
error_var <- 1
with_err_gen <- 'rnorm'
pow_param <- c('(Intercept)', 'act', 'diff', 'numCourse')
alpha <- .01
pow_dist <- "t"
pow_tail <- 2
replicates <- 100
power_out <- sim_pow(fixed = fixed, fixed_param = fixed_param, 
                     cov_param = cov_param,n = n, error_var = error_var,
                     with_err_gen = with_err_gen, data_str = "single",
                     pow_param = pow_param, alpha = alpha,
                     pow_dist = pow_dist, pow_tail = pow_tail, 
                     replicates = replicates)
head(power_out)

Varying Arguments

fixed <- ~ 1 + act + diff + numCourse + act:numCourse
fixed_param <- c(0.5, 1.1, 0.6, 0.9, 1.1)
cov_param <- list(dist_fun = c('rnorm', 'rnorm', 'rnorm'), 
                  var_type = c("single", "single", "single"),
                  opts = list(list(mean = 0, sd = 2),
                              list(mean = 0, sd = 2),
                              list(mean = 0, sd = 1)))
n <- NULL
error_var <- NULL
with_err_gen <- 'rnorm'
pow_param <- c('(Intercept)', 'act', 'diff', 'numCourse')
alpha <- .01
pow_dist <- "t"
pow_tail <- 2
replicates <- 10
terms_vary <- list(n = c(20, 40, 60, 80, 100), error_var = c(5, 10, 20),
                   fixed_param = list(c(0.5, 1.1, 0.6, 0.9, 1.1), 
                                      c(0.6, 1.1, 0.6, 0.9, 1.1)),
                cov_param = list(list(dist_fun = c('rnorm', 'rnorm', 'rnorm'),
                                       mean = c(0, 0, 0), sd = c(2, 2, 1), 
                                  var_type = c("single", "single", "single")),
                                  list(dist_fun = c('rnorm', 'rnorm', 'rnorm'),
                                       mean = c(0.5, 0, 0), sd = c(2, 2, 1), 
                                  var_type = c("single", "single", "single"))
                                  )
                   )
power_out <- sim_pow(fixed = fixed, fixed_param = fixed_param, 
                     cov_param = cov_param,
                     n = n, error_var = error_var, with_err_gen = with_err_gen, 
                     data_str = "single", pow_param = pow_param, alpha = alpha,
                     pow_dist = pow_dist, pow_tail = pow_tail, 
                     replicates = replicates, terms_vary = terms_vary)

Model Misspecification

It is also possible to specify a model different than the generating model for evaluation of power. This may be useful if it is thought another variable is important in explaining variation, however due to design issues is not possible to collect this variable. As a result, there would likely be additional variation due to this variable that will not be able to be explained. Building this into the generating model can help provide additional information about the impact on power.

An example using single level power analysis is shown below.

fixed <- ~ 1 + act + diff + numCourse + act:numCourse
fixed_param <- c(0.5, 1.1, 0.6, 0.9, 1.1)
cov_param <- list(dist_fun = c('rnorm', 'rnorm', 'rnorm'),
                  var_type = c("single", "single", "single"),
                  opts = list(list(mean = 0, sd = 2),
                              list(mean = 0, sd = 2),
                              list(mean = 0, sd = 1)))
n <- 150
error_var <- 20
with_err_gen <- 'rnorm'
pow_param <- c('(Intercept)', 'act', 'diff', 'numCourse')
alpha <- .01
pow_dist <- "t"
pow_tail <- 2
replicates <- 100

lm_fit_mod <- sim_data ~ 1 + act + diff

power_out <- sim_pow(fixed = fixed, fixed_param = fixed_param, 
                     cov_param = cov_param, n = n, error_var = error_var,
                     with_err_gen = with_err_gen, data_str = "single",
                     pow_param = pow_param, alpha = alpha,
                     pow_dist = pow_dist, pow_tail = pow_tail, 
                     replicates = replicates, lm_fit_mod = lm_fit_mod)
head(power_out)

You'll notice, compared to above, the power is somewhat reduced for the other three fixed effects when not modeling the numCourse variable. You can also build in the lm_fit_mod argument into the design via the terms_vary argument.

fixed <- ~ 1 + act + diff + numCourse + act:numCourse
fixed_param <- c(0.5, 1.1, 0.6, 0.9, 1.1)
cov_param <- list(dist_fun = c('rnorm', 'rnorm', 'rnorm'),
                  var_type = c("single", "single", "single"),
                  opts = list(list(mean = 0, sd = 2),
                              list(mean = 0, sd = 2),
                              list(mean = 0, sd = 1)))
n <- 150
error_var <- 20
with_err_gen <- 'rnorm'
pow_param <- c('(Intercept)', 'act', 'diff', 'numCourse')
alpha <- .01
pow_dist <- "t"
pow_tail <- 2
replicates <- 100

terms_vary <- list(n = c(20, 40, 60, 80, 100), error_var = c(5, 10, 20),
                   fixed_param = list(c(0.5, 1.1, 0.6, 0.9, 1.1), 
                                      c(0.6, 1.1, 0.6, 0.9, 1.1)),
                lm_fit_mod = list(sim_data ~ 1 + act + diff, 
                                  sim_data ~ 1 + act)
)

power_out <- sim_pow(fixed = fixed, fixed_param = fixed_param, 
                     cov_param = cov_param, n = n, error_var = error_var,
                     with_err_gen = with_err_gen, data_str = "single",
                     pow_param = pow_param, alpha = alpha,
                     pow_dist = pow_dist, pow_tail = pow_tail, 
                     replicates = replicates, terms_vary = terms_vary)
head(power_out)

Nested Data

Extending the power analysis to nested data is fairly straightforward. Below is a three level example

fixed <- ~1 + time + diff + act + actClust + time:act
random <- ~1 + time
random3 <- ~ 1 + time
fixed_param <- c(4, 2, 6, 2.3, 7, 0)
random_param <- list(random_var = c(7, 4), rand_gen = 'rnorm')
random_param3 <- list(random_var = c(4, 2), rand_gen = 'rnorm')
cov_param <- list(dist_fun = c('rnorm', 'rnorm', 'rnorm'),
                  var_type = c("level1", "level2", "level3"),
                  opts = list(list(mean = 0, sd = 1.5),
                              list(mean = 0, sd = 4),
                              list(mean = 0, sd = 2)))
k <- 10
n <- 45
p <- 8
error_var <- 4
with_err_gen <- 'rnorm'
data_str <- "long"
pow_param <- c('time', 'diff', 'act', 'actClust')
alpha <- .01
pow_dist <- "z"
pow_tail <- 2
replicates <- 3
power_out <- sim_pow(fixed = fixed, random = random, random3 = random3,
                     fixed_param = fixed_param,
                     random_param = random_param, 
                     random_param3 = random_param3,
                     cov_param = cov_param,
                     k = k, n = n, p = p,
                     error_var = error_var, with_err_gen = "rnorm",
                     data_str = data_str, 
                     unbal = list(level2 = FALSE, level3 = FALSE),
                     pow_param = pow_param, alpha = alpha,
                     pow_dist = pow_dist, pow_tail = pow_tail, 
                     replicates = replicates)
head(power_out)

Generalized Power Analysis

fixed <- ~ 1 + act + diff
fixed_param <- c(0.1, 0.5, 0.3)
cov_param <- list(dist_fun = c('rnorm', 'rnorm'),
                  var_type = c("single", "single"),
                  opts = list(list(mean = 0, sd = 2),
                              list(mean = 0, sd = 4)))
n <- 50
pow_param <- c('(Intercept)', 'act', 'diff')
alpha <- .01
pow_dist <- "z"
pow_tail <- 2
replicates <- 10

power_out <- sim_pow_glm(fixed = fixed, fixed_param = fixed_param, 
                         cov_param = cov_param, 
                         n = n, data_str = "single", 
                         outcome_type = 'logistic',
                         pow_param = pow_param, alpha = alpha,
                         pow_dist = pow_dist, pow_tail = pow_tail, 
                         replicates = replicates)
head(power_out)

Vary Arguments

fixed <- ~ 1 + act + diff
fixed_param <- c(0.1, 0.5, 0.3)
cov_param <- list(dist_fun = c('rnorm', 'rnorm'),
                  var_type = c("single", "single"),
                  opts = list(list(mean = 0, sd = 5),
                              list(mean = 0, sd = 8)))
n <- NULL
pow_param <- c('(Intercept)', 'act', 'diff')
alpha <- .01
pow_dist <- "z"
pow_tail <- 2
replicates <- 10
terms_vary <- list(n = c(20, 40, 60, 80, 100),
                   fixed_param = list(c(0.5, 0.1, 0.2), 
                                      c(0.6, 0.1, 0.2)))

power_out <- sim_pow_glm(fixed = fixed, fixed_param = fixed_param, 
                         cov_param = cov_param, 
                         n = n, data_str = "single", 
                         outcome_type = 'logistic',
                         pow_param = pow_param, alpha = alpha,
                         pow_dist = pow_dist, pow_tail = pow_tail, 
                         replicates = replicates, terms_vary = terms_vary)
head(power_out)