inst/doc/Simulating-Power.R

## ----set_options, echo=FALSE---------------------------------------------
# This file is the actual vignette file, which uses prepopulated data objects
# from the file in the `vignettes-build` directory. To update the vignette, run
# the code in the `vignettes-build` file first, and then knit this file.
knitr::opts_chunk$set(collapse=TRUE, comment='#>')

## ----load_functions, message=FALSE---------------------------------------
# load all packages used in this vignette
library('paramtest')
library('pwr')
library('ggplot2')
library('knitr')
library('nlme')
library('lavaan')
library('dplyr')

## ----load_data, echo=FALSE-----------------------------------------------
load('power_data.Rdata')

## ----pwr_ttest-----------------------------------------------------------
pwr.t.test(n=50, d=.5, type='two.sample')

## ----sim_ttest, eval=FALSE-----------------------------------------------
#  # create user-defined function to generate and analyze data
#  t_func <- function(simNum, N, d) {
#      x1 <- rnorm(N, 0, 1)
#      x2 <- rnorm(N, d, 1)
#  
#      t <- t.test(x1, x2, var.equal=TRUE)  # run t-test on generated data
#      stat <- t$statistic
#      p <- t$p.value
#  
#      return(c(t=stat, p=p, sig=(p < .05)))
#          # return a named vector with the results we want to keep
#  }
#  
#  power_ttest <- run_test(t_func, n.iter=5000, output='data.frame', N=50, d=.5)  # simulate data
#  results(power_ttest) %>%
#      summarise(power=mean(sig))

## ----sim_ttest_out, echo=FALSE-------------------------------------------
results(power_ttest) %>%
    summarise(power=mean(sig))

## ----vary_params, eval=FALSE---------------------------------------------
#  # give 'params' a list of parameters we want to vary;
#  # testing at N=25, N=50, and N=100
#  power_ttest_vary <- grid_search(t_func, params=list(N=c(25, 50, 100)),
#      n.iter=5000, output='data.frame', d=.5)
#  results(power_ttest_vary) %>%
#      group_by(N.test) %>%
#      summarise(power=mean(sig))

## ----vary_params_out, echo=FALSE-----------------------------------------
results(power_ttest_vary) %>%
    group_by(N.test) %>%
    summarise(power=mean(sig)) %>%
    kable(digits=3)

## ----vary_params2, eval=FALSE--------------------------------------------
#  # varying N and Cohen's d
#  power_ttest_vary2 <- grid_search(t_func, params=list(N=c(25, 50, 100), d=c(.2, .5)),
#      n.iter=5000, output='data.frame')
#  power <- results(power_ttest_vary2) %>%
#      group_by(N.test, d.test) %>%
#      summarise(power=mean(sig))
#  print(power)
#  ggplot(power, aes(x=N.test, y=power, group=factor(d.test), colour=factor(d.test))) +
#      geom_point() +
#      geom_line() +
#      ylim(c(0, 1)) +
#      labs(x='Sample Size', y='Power', colour="Cohen's d") +
#      theme_minimal()

## ----vary_params2_out, echo=FALSE, fig.width=6, fig.height=4-------------
power <- results(power_ttest_vary2) %>%
    group_by(N.test, d.test) %>%
    summarise(power=mean(sig))
kable(power, digits=3)
ggplot(power, aes(x=N.test, y=power, group=factor(d.test), colour=factor(d.test))) +
    geom_point() +
    geom_line() +
    ylim(c(0, 1)) +
    labs(x='Sample Size', y='Power', colour="Cohen's d") +
    theme_minimal()

## ----parallel1, eval=FALSE-----------------------------------------------
#  # time with no parallel processing (your mileage may very greatly)
#  system.time(power_ttest_vary3 <- grid_search(t_func,
#      params=list(N=c(25, 50, 100), d=c(.2, .5)), n.iter=5000, output='data.frame'))

## ----parallel1_out, echo=FALSE-------------------------------------------
print(power_ttest_time1)

## ----parallel2, eval=FALSE-----------------------------------------------
#  # using parallel processing; I am using a Windows system, so I use parallel='snow';
#  # see the documentation for the 'parallel' package for more details
#  system.time(power_ttest_vary4 <- grid_search(t_func,
#      params=list(N=c(25, 50, 100), d=c(.2, .5)), n.iter=5000, output='data.frame', parallel='snow', ncpus=4))

## ----parallel2_out, echo=FALSE-------------------------------------------
print(power_ttest_time2)

## ----bootstrap, eval=FALSE-----------------------------------------------
#  # user function must take data and indices as first two arguments; see 'boot'
#  # package documentation for more details
#  t_func_boot <- function(data, indices) {
#      sample_data <- data[indices, ]
#      treatGroup <- sample_data[sample_data$group == 'trt2', 'weight']
#      ctrlGroup <- sample_data[sample_data$group == 'ctrl', 'weight']
#  
#      t <- t.test(treatGroup, ctrlGroup, var.equal=TRUE)
#      stat <- t$statistic
#      p <- t$p.value
#  
#      return(c(t=stat, p=p, sig=(p < .05)))
#  }
#  
#  # example using built-in dataset PlantGrowth
#  power_ttest_boot <- run_test(t_func_boot, n.iter=5000, output='data.frame', boot=TRUE,
#      bootParams=list(data=PlantGrowth))
#  results(power_ttest_boot) %>%
#      summarise(power=mean(sig))

## ----bootstrap_out, echo=FALSE-------------------------------------------
results(power_ttest_boot) %>%
    summarise(power=mean(sig))

## ----sim_linear, eval=FALSE----------------------------------------------
#  lm_test <- function(simNum, N, b1, b0=0, xm=0, xsd=1) {
#      x <- rnorm(N, xm, xsd)
#      y <- rnorm(N, b0 + b1*x, sqrt(1 - b1^2))  # var. approx. 1 after accounting
#                                                # for explained variance by x
#      model <- lm(y ~ x)
#  
#      # pull output from model
#      est <- coef(summary(model))['x', 'Estimate']
#      se <- coef(summary(model))['x', 'Std. Error']
#      p <- coef(summary(model))['x', 'Pr(>|t|)']
#  
#      return(c(xm=mean(x), xsd=sd(x), ym=mean(y), ysd=sd(y), est=est, se=se, p=p,
#          sig=(p < .05)))
#  }
#  
#  # we vary N at 200 and 300; we are also setting coefficient of x predicting
#  # y to be approx. .15 across all simulations
#  power_lm <- grid_search(lm_test, params=list(N=c(200, 300)), n.iter=5000, output='data.frame', b1=.15,
#      parallel='snow', ncpus=4)
#  results(power_lm) %>%
#      group_by(N.test) %>%
#      summarise(power=mean(sig))

## ----sim_linear_out, echo=FALSE------------------------------------------
results(power_lm) %>%
    group_by(N.test) %>%
    summarise(power=mean(sig)) %>%
    kable(digits=3)

## ----pwr_linear----------------------------------------------------------
# f2 = R^2 / (1 - R^2)
pwr.f2.test(u=1, v=c(200-2, 300-2), f2=(.15^2) / (1 - .15^2))

## ----sim_linear2, eval=FALSE---------------------------------------------
#  lm_test_interaction <- function(simNum, N, b1, b2, b3, b0=0, x1m=0, x1sd=1,
#      x2m=0, x2sd=1) {
#  
#      x1 <- rnorm(N, x1m, x1sd)
#      x2 <- rnorm(N, x2m, x2sd)
#      yvar <- sqrt(1 - b1^2 - b2^2 - b3^2)  # residual variance
#      y <- rnorm(N, b0 + b1*x1 + b2*x2 + b3*x1*x2, yvar)
#      model <- lm(y ~ x1 * x2)
#  
#      # pull output from model (two main effects and interaction)
#      est_x1 <- coef(summary(model))['x1', 'Estimate']
#      p_x1 <- coef(summary(model))['x1', 'Pr(>|t|)']
#      sig_x1 <- p_x1 < .05
#      est_x2 <- coef(summary(model))['x2', 'Estimate']
#      p_x2 <- coef(summary(model))['x2', 'Pr(>|t|)']
#      sig_x2 <- p_x2 < .05
#      est_int <- coef(summary(model))['x1:x2', 'Estimate']
#      p_int <- coef(summary(model))['x1:x2', 'Pr(>|t|)']
#      sig_int <- p_int < .05
#  
#      return(c(est_x1=est_x1, p_x1=p_x1, sig_x1=sig_x1, est_x2=est_x2, p_x2=p_x2,
#          sig_x2=sig_x2, est_int=est_int, p_int=p_int, sig_int=sig_int))
#  }
#  
#  # varying N at 200 and 300; setting coefficient of x1 = .15, coefficient of
#  # x2 = 0, and coefficien of interaction = .3
#  power_lm_int <- grid_search(lm_test_interaction, params=list(N=c(200, 300)),
#      n.iter=5000, output='data.frame', b1=.15, b2=0, b3=.3, parallel='snow', ncpus=4)
#  results(power_lm_int) %>%
#      group_by(N.test) %>%
#      summarise(
#          power_x1=mean(sig_x1),
#          power_x2=mean(sig_x2),
#          power_int=mean(sig_int))

## ----sim_linear2_out, echo=FALSE-----------------------------------------
results(power_lm_int) %>%
    group_by(N.test) %>%
    summarise(
        power_x1=mean(sig_x1),
        power_x2=mean(sig_x2),
        power_int=mean(sig_int)) %>%
    kable(digits=3)

## ----sim_simple_eff, eval=FALSE------------------------------------------
#  lm_test_simple <- function(simNum, N, b1, b2, b3, b0=0, x1m=0, x1sd=1, x2m=0, x2sd=1) {
#      x1 <- rnorm(N, x1m, x1sd)
#      x2 <- rnorm(N, x2m, x2sd)
#      yvar <- sqrt(1 - b1^2 - b2^2 - b3^2)
#      y <- rnorm(N, b0 + b1*x1 + b2*x2 + b3*x1*x2, yvar)
#      model <- lm(y ~ x1 * x2)  # here is the original model
#  
#      est_int <- coef(summary(model))['x1:x2', 'Estimate']
#      p_int <- coef(summary(model))['x1:x2', 'Pr(>|t|)']
#      sig_int <- p_int < .05
#  
#      # calculate x1 at +/- 1 SD, to look at simple effects
#      x1minus1sd <- x1 - mean(x1) + sd(x1)
#      x1plus1sd <- x1 - mean(x1) - sd(x1)
#  
#      # new models to examine simple effects
#      model2 <- lm(y ~ x1minus1sd * x2)
#      model3 <- lm(y ~ x1plus1sd * x2)
#  
#      # test effect of x2 when x1 is at +/- 1 SD
#      est_x2_minus1 <- coef(summary(model2))['x2', 'Estimate']
#      p_x2_minus1 <- coef(summary(model2))['x2', 'Pr(>|t|)']
#      sig_x2_minus1 <- p_x2_minus1 < .05
#  
#      est_x2_plus1 <- coef(summary(model3))['x2', 'Estimate']
#      p_x2_plus1 <- coef(summary(model3))['x2', 'Pr(>|t|)']
#      sig_x2_plus1 <- p_x2_plus1 < .05
#  
#      return(c(est_int=est_int, p_int=p_int, sig_int=sig_int,
#          est_x2_minus1=est_x2_minus1, p_x2_minus1=p_x2_minus1,
#          sig_x2_minus1=sig_x2_minus1, est_x2_plus1=est_x2_plus1,
#          p_x2_plus1=p_x2_plus1, sig_x2_plus1=sig_x2_plus1))
#  }
#  
#  power_lm_simple <- grid_search(lm_test_simple, params=list(N=c(200, 300)),
#      n.iter=5000, output='data.frame', b1=.15, b2=0, b3=.3, parallel='snow', ncpus=4)
#  results(power_lm_simple) %>%
#      group_by(N.test) %>%
#      summarise(
#          power_x2_minus1=mean(sig_x2_minus1),
#          power_x2_plus1=mean(sig_x2_plus1))

## ----sim_simple_eff_out, echo=FALSE--------------------------------------
results(power_lm_simple) %>%
    group_by(N.test) %>%
    summarise(
        power_x2_minus1=mean(sig_x2_minus1),
        power_x2_plus1=mean(sig_x2_plus1)) %>%
    kable(digits=3)

## ----sim_mlm, eval=FALSE-------------------------------------------------
#  mlm_test <- function(simNum, N, b1, b0=0, xm=0, xsd=1, varInt=1, varSlope=1, varResid=1) {
#      timePoints <- 4
#      subject <- rep(1:N, each=timePoints)
#      sub_int <- rep(rnorm(N, 0, sqrt(varInt)), each=timePoints)  # random intercept
#      sub_slope <- rep(rnorm(N, 0, sqrt(varSlope)), each=timePoints)  # random slope
#      time <- rep(0:(timePoints-1), N)
#      y <- (b0 + sub_int) + (b1 + sub_slope)*time + rnorm(N*timePoints, 0, sqrt(varResid))
#          # y-intercept as a function of b0 plus random intercept;
#          # slope as a function of b1 plus random slope
#      data <- data.frame(subject, sub_int, sub_slope, time, y)
#  
#      # for more complex models that might not converge, tryCatch() is probably
#      # a good idea
#      return <- tryCatch({
#          model <- nlme::lme(y ~ time, random=~time|subject, data=data)
#              # when using parallel processing, we must refer to functions from
#              # packages directly, e.g., package::function()
#  
#          est <- summary(model)$tTable['time', 'Value']
#          se <- summary(model)$tTable['time', 'Std.Error']
#          p <- summary(model)$tTable['time', 'p-value']
#          return(c(est=est, se=se, p=p, sig=(p < .05)))
#      },
#      error=function(e) {
#          #message(e)  # print error message
#          return(c(est=NA, se=NA, p=NA, sig=NA))
#      })
#  
#      return(return)
#  }
#  
#  # I am cutting this down to 500 iterations so that the document compiles faster; I would, however,
#  # recommend more iterations for a stable estimate
#  power_mlm <- grid_search(mlm_test, params=list(N=c(200, 300)), n.iter=500, output='data.frame', b1=.15,
#      varInt=.05, varSlope=.15, varResid=.4, parallel='snow', ncpus=4)
#  results(power_mlm) %>%
#      group_by(N.test) %>%
#      summarise(
#          power=mean(sig, na.rm=TRUE),
#          na=sum(is.na(sig)))  # we use this to count up how many cases did not properly converge

## ----sim_mlm_out, echo=FALSE---------------------------------------------
results(power_mlm) %>%
    group_by(N.test) %>%
    summarise(
        power=mean(sig, na.rm=TRUE),
        na=sum(is.na(sig))) %>%  # we use this to count up how many cases did not properly converge
    kable(digits=3)

## ----sim_mediation, eval=FALSE-------------------------------------------
#  med_test <- function(simNum, N, aa, bb, cc) {
#      x <- rnorm(N, 0, 1)
#      m <- rnorm(N, aa*x, sqrt(1 - aa^2))
#      y <- rnorm(N, cc*x + bb*m, sqrt(1 - cc^2 - bb^2))
#      data <- data.frame(x, m, y)
#  
#      # set up lavaan model to calculate indirect effect (ab) and total effect
#      model <- '
#          m ~ a*x
#          y ~ c*x
#          y ~ b*m
#          ab := a*b
#          total := c + (a*b)'
#  
#      fit <- lavaan::sem(model, data=data)
#      ests <- lavaan::parameterEstimates(fit)
#          # when using parallel processing, we must refer to functions from
#          # packages directly, e.g., package::function()
#  
#      # pull output from model
#      a_est <- ests[ests$label == 'a', 'est']
#      a_p <- ests[ests$label == 'a', 'pvalue']
#      b_est <- ests[ests$label == 'b', 'est']
#      b_p <- ests[ests$label == 'b', 'pvalue']
#      c_est <- ests[ests$label == 'c', 'est']
#      c_p <- ests[ests$label == 'c', 'pvalue']
#      ab_est <- ests[ests$label == 'ab', 'est']
#      ab_p <- ests[ests$label == 'ab', 'pvalue']
#  
#      return(c(a_est=a_est, a_p=a_p, b_est=b_est, b_p=b_p, c_est=c_est, c_p=c_p,
#          ab_est=ab_est, ab_p=ab_p, sig=(ab_p < .05)))
#  }
#  
#  # set up mediation model where x -> m = .15, m -> y = .2, and x -> y = .05
#  power_med <- grid_search(med_test, params=list(N=c(200, 300)), n.iter=1000, output='data.frame', aa=.15,
#      bb=.2, cc=.05, parallel='snow', ncpus=4)
#  results(power_med) %>%
#      group_by(N.test) %>%
#      summarise(
#          power=mean(sig, na.rm=TRUE))

## ----sim_mediation_out, echo=FALSE---------------------------------------
results(power_med) %>%
    group_by(N.test) %>%
    summarise(
        power=mean(sig, na.rm=TRUE)) %>%
    kable(digits=3)

## ----sim_latent, eval=FALSE----------------------------------------------
#  latent_test <- function(simNum, N, b1, ind1, ind2, ind3) {
#      # matrix of factor structure; we have x as observed predictor, and y is a
#      # latent variable with three indicators
#      fmodel <- matrix(
#          c(1, 0,      # x
#            0, ind1,   # y1
#            0, ind2,   # y2
#            0, ind3),  # y3
#          nrow=4, ncol=2, byrow=TRUE, dimnames=list(
#              c('x', 'y1', 'y2', 'y3'),  # rows (observed)
#              c('x', 'y')))              # cols (latent)
#  
#      # matrix of effects structure (variance-covariance); we are using x to
#      # predict y (with coefficient specified as b1)
#      y_resid_var <- sqrt(1 - b1^2)
#      effects <- matrix(
#          c(1, b1,            # x
#            0, y_resid_var),  # y
#          nrow=2, ncol=2, byrow=TRUE, dimnames=list(
#              c('x', 'y'), c('x', 'y')))
#  
#      data <- paramtest::gen_data(fmodel, effects)
#          # generates the data using factor and effects matrices
#  
#      model <- '
#          y =~ y1 + y2 + y3
#          y ~ b1*x'
#  
#      fit <- lavaan::sem(model, data=data)
#      ests <- lavaan::parameterEstimates(fit)
#  
#      est <- ests[ests$label == 'b1', 'est']
#      p <- ests[ests$label == 'b1', 'pvalue']
#  
#      return(c(est=est, p=p, sig=(p < .05)))
#  }
#  
#  power_sem <- grid_search(latent_test, params=list(N=c(200, 300)), n.iter=1000, output='data.frame',
#      b1=.15, ind1=.4, ind2=.4, ind3=.4, parallel='snow', ncpus=4)
#  results(power_sem) %>%
#      group_by(N.test) %>%
#      summarise(
#          power=mean(sig, na.rm=TRUE))

## ----sim_latent_out, echo=FALSE------------------------------------------
results(power_sem) %>%
    group_by(N.test) %>%
    summarise(
        power=mean(sig, na.rm=TRUE)) %>%
    kable(digits=3)

Try the paramtest package in your browser

Any scripts or data that you put into this service are public.

paramtest documentation built on Nov. 17, 2017, 7:11 a.m.