ANOVApower: Simulation-Based Power Analysis for ANOVA Designs

Documented in ANOVA_design

#' Design function used to specify the parameters used in the simulation
#' @param design String specifying the ANOVA design.
#' @param n Sample size in each condition
#' @param mu Vector specifying mean for each condition
#' @param sd standard deviation for all conditions
#' @param r Correlation between dependent variables (single value or matrix)
#' @param labelnames Optional vector to specifying factor and condition names (recommended, if not used factors and levels are indicated by letters and numbers)
#' @param plot Should means plot be printed (defaults to TRUE)
#' @return Returns Single data-frame with simulated data, design, design list, factor names, formulas for ANOVA, means, sd, correlation, sample size per condition, correlation matrix, covariance matrix, design string, labelnames, labelnameslist, factor names, meansplot
#' @examples
#' ## Set up a within design with 2 factors, each with 2 levels,
#' ## with correlation between observations of 0.8,
#' ## 40 participants (who do all conditions), and standard deviation of 2
#' ## with a mean pattern of 1, 0, 1, 0, conditions labeled 'condition' and
#' ## 'voice', with names for levels of "cheerful", "sad", and "human", "robot"
#' ANOVA_design(design = "2w*2w", n = 40, mu = c(1, 0, 1, 0), sd = 2, r = 0.8,
#'       labelnames = c("condition", "cheerful", "sad", "voice", "human", "robot"))
#' @section References:
#' too be added
#' @importFrom stats pnorm pt qnorm qt as.formula median
#' @importFrom utils combn
#' @importFrom reshape2 melt
#' @importFrom MASS mvrnorm
#' @importFrom afex aov_car
#' @importFrom grDevices colorRampPalette
#' @import ggplot2
#' @export
#'

ANOVA_design <- function(design, n, mu, sd, r = 0, labelnames = NULL, plot = TRUE){

#Check String for an acceptable digits and factor (w or b)
  if (grepl("^(\\d{1,3}(w|b)\\*){0,2}\\d{1,3}(w|b)$", design, ignore.case = FALSE, perl = TRUE) == FALSE) {
    stop("Problem in the design argument: must input number of levels as integer (2-999) and factor-type (between or within) as lower case b (between) or w (within)")
  }

#Ensure sd is greater than 0
  if (any(sd <= 0)) {
    stop("Standard deviation (sd) is less than or equal to zero; input a value greater than zero")
  }

#Ensure, if single correlation is input, that it is between 0 and 1
  if (any(r < -1) | any(r >= 1) ) {
    stop("Correlation must be greater than -1 and less than 1")
  }

#Ensure proper n input
  if (length(n) != 1 ) {
    stop("Only balanced designs allowed: n can only be one value")
  }

  #If labelnames are not provided, they are generated.
  #Store factor levels (used many times in the script, calculate once)
  factor_levels <- as.numeric(strsplit(design, "\\D+")[[1]])

  if (is.null(labelnames)) {
    for(i1 in 1:length(factor_levels)){
      labelnames <- append(labelnames,paste(paste(letters[i1]), sep = ""))
      for(i2 in 1:factor_levels[i1]){
        labelnames <- append(labelnames,paste(paste(letters[i1]), paste(i2), sep = ""))
      }
    }
  }

  if (length(labelnames) != length(factor_levels) + sum(factor_levels)) {
    stop("Variable 'design' does not match the length of the labelnames")
  }

  ###############
  # 1. Specify Design and Simulation----
  ###############
  # String used to specify the design
  # Add numbers for each factor with 2 levels, e.g., 2 for a factor with 2 levels
  # Add a 'w' after the number for within factors, and a 'b' for between factors
  # Separate factors with a * (asterisk)
  # Thus "2b*3w) is a design with 2 between levels, and 3 within levels

  #Check if design and means match up - if not, throw an error and stop
  if(prod(factor_levels) != length(mu)){stop("the length of the vector with means does not match the study design")}

  #Check if the design and sd match (either 1 or length of design)
  #if(length(sd) != 1 && prod(factor_levels) != length(sd)){stop("The SD must be a length of 1 or match the length of the study design")}

  #Check if the factors are of an acceptable number of levels
  if(any(factor_levels <= 0) == TRUE | any(factor_levels > 99) ) {
    stop("Each factor can only have between 2 and 99 levels")
  }

  ###############
  # 2. Create Factors and Design ----
  ###############

  #Count number of factors in design
  factors <- length(factor_levels)

  #Get factor names and labelnameslist
  labelnames1 <- labelnames[(1 + 1):(1+factor_levels[1])]
  if(factors > 1){labelnames2 <- labelnames[(factor_levels[1] + 3):((factor_levels[1] + 3) + factor_levels[2] - 1)]}
  if(factors > 2){labelnames3 <- labelnames[(factor_levels[2] + factor_levels[1] + 4):((factor_levels[2] + factor_levels[1] + 4) + factor_levels[3] - 1)]}

  factornames1 <- labelnames[1]
  if(factors > 1){factornames2 <- labelnames[factor_levels[1] + 2]}
  if(factors > 2){factornames3 <- labelnames[factor_levels[2] + factor_levels[1] + 3]}

  if(factors == 1){labelnameslist <- list(labelnames1)}
  if(factors == 2){labelnameslist <- list(labelnames1,labelnames2)}
  if(factors == 3){labelnameslist <- list(labelnames1,labelnames2,labelnames3)}

  if(factors == 1){factornames <- c(factornames1)}
  if(factors == 2){factornames <- c(factornames1,factornames2)}
  if(factors == 3){factornames <- c(factornames1,factornames2,factornames3)}

  #Specify within/between factors in design: Factors that are within are 1, between 0
  design_factors <- strsplit(gsub("[^A-Za-z]","",design),"",fixed = TRUE)[[1]]
  design_factors <- as.numeric(design_factors == "w") #if within design, set value to 1, otherwise to 0

  #Specify design list (similar as below)
  xxx <- data.frame(matrix(NA, nrow = prod(factor_levels), ncol = 0))
  for(j in 1:factors){
    xxx <- cbind(xxx, as.factor(unlist(rep(as.list(paste(labelnameslist[[j]],
                                                         sep="_")),
                                           each = prod(factor_levels)/prod(factor_levels[1:j]),
                                           times = prod(factor_levels)/prod(factor_levels[j:factors])
    ))))
  }
  design_list <- as.character(interaction(xxx[, 1:factors], sep = "_")) #create a new condition variable combine 2 columns (interaction is a cool function!)

  ###############
  # 3. Create Correlation and Covariance Matrix ----
  ###############

  #Create empty matrix
  sigmatrix <- data.frame(matrix(ncol=length(mu), nrow = length(mu)))

  #NEW CODE, JUST FOR SINGLE correlation entry

  #single number
  cors <- r
  vars <- length(design_list)

  #Code by Lisa De Bruine. Allows multiple inputs for r - only use single value now.
  #from: https://github.com/debruine/faux/blob/master/R/rnorm_multi.R
  generate_cor_matrix  <- function(vars = 3, cors = 0, mu = 0, sd = 1) {
    if (length(mu) == 1) {
      mu <- rep(mu, vars)
    } else if (length(mu) != vars) {
      stop("the length of mu must be 1 or vars");
    }

    if (length(sd) == 1) {
      sd <- rep(sd, vars)
    } else if (length(sd) != vars) {
      stop("the length of sd must be 1 or vars");
    }

    # correlation matrix
    if (class(cors) == "numeric" & length(cors) == 1) {
      if (cors >=-1 & cors <=1) {
        cors = rep(cors, vars*(vars-1)/2)
      } else {
        stop("cors must be between -1 and 1")
      }
    }

    if (class(cors) == "matrix") {
      if (!is.numeric(cors)) {
        stop("cors matrix not numeric")
      } else if (dim(cors)[1] != vars || dim(cors)[2] != vars) {
        stop("cors matrix wrong dimensions")
      } else if (sum(cors == t(cors)) != (nrow(cors)^2)) {
        stop("cors matrix not symmetric")
      } else {
        cor_mat <- cors
      }
    } else if (length(cors) == vars*vars) {
      cor_mat <- matrix(cors, vars)
    } else if (length(cors) == vars*(vars-1)/2) {
      # generate full matrix from vector of upper right triangle

      cor_mat <- matrix(nrow=vars, ncol = vars)
      upcounter = 1
      lowcounter = 1
      for (col in 1:vars) {
        for (row in 1:vars) {
          if (row == col) {
            # diagonal
            cor_mat[row, col] = 1
          } else if (row > col) {
            # lower left triangle
            cor_mat[row, col] = cors[lowcounter]
            lowcounter <- lowcounter + 1
          }
        }
      }
      for (row in 1:vars) {
        for (col in 1:vars) {
          if (row < col) {
            # upper right triangle
            cor_mat[row, col] = cors[upcounter]
            upcounter <- upcounter + 1
          }
        }
      }
    }

    # check matrix is positive definite
    tol <- 1e-08
    ev <- eigen(cor_mat, only.values = TRUE)$values
    if (sum(ev < tol)) {
      stop("correlation matrix not positive definite")
    }

    return(cor_mat)
  }
  cor_mat <- generate_cor_matrix(vars = vars, cors = cors)

  sd_for_sigma <- sd #added to prevent changing sd which is now passed on
  if (length(sd_for_sigma) == 1) {
    sd_for_sigma <- rep(sd_for_sigma, vars)
  } else if (length(sd_for_sigma) != vars) {
    stop("the length of sd_for_sigma must be 1 or vars");
  }

  sigma <- (sd_for_sigma %*% t(sd_for_sigma)) * cor_mat #Our earlier code had a bug, with SD on the diagonal. Not correct! Thanks Lisa.

  #General approach: For each factor in the list of the design, save the first item (e.g., a1b1)
  #Then for each factor in the design, if 1, set number to wildcard
  i1<-1
  i2<-1
  for(i1 in 1:length(design_list)){
    design_list_split <- unlist(strsplit(design_list[i1],"_"))
    #current_factor <- design_list_split[c(2,4,6)[1:length(design)]] #this creates a string of 2, 2,4 or 2,4,6 depending on the length of the design for below
    for(i2 in 1:length(design_factors)){
      #We set each number that is within to a wildcard, so that all within participant factors are matched
      if(design_factors[i2]==1){design_list_split[i2] <- "\\w+"}
    }
    sigmatrix[i1,]<-as.numeric(grepl(paste0(design_list_split, collapse="_"), design_list)) # compare factors that match with current factor, given wildcard, save list to sigmatrix
  }

  #Now multiply the matrix we just created (that says what is within, and what is between,  with the original covariance matrix)
  #So factors manipulated within are correlated, those manipulated between are not.
  cor_mat <- sigmatrix*cor_mat
  sigmatrix <- sigma*sigmatrix
  row.names(sigmatrix) <- design_list
  colnames(sigmatrix) <- design_list
  row.names(cor_mat) <- design_list
  colnames(cor_mat) <- design_list

  ###############
  # 4. Create Dataframe based on Design ----
  ###############

  #Create the data frame. This will be re-used in the simulation (y variable is overwritten) but created only once to save time in the simulation
  dataframe <- as.data.frame(mvrnorm(n=n,
                              mu=mu,
                              Sigma=sigmatrix,
                              empirical = FALSE))
  dataframe$subject<-as.factor(c(1:n)) #create temp subject variable just for merging
  #Melt dataframe
  dataframe <- melt(dataframe,
             id.vars = "subject",
             variable.name = "cond",
             value.name = "y")

  # Let's break this down - it's a bit tricky. First, we want to create a list of labelnames that will indicate the factors.
  # We are looping this over the number of factors.
  # This: factor_levels - takes the string used to specify the design and turn it in a list.
  # we take the labelnames and factornames and combine them
  # We repeat these each: n*(2^(factors-1)*2)/(2^j) and them times:  (2^j/2) to get a list for each factor
  # We then bind these together with the existing dataframe.
  for(j in 1:factors){
    dataframe <- cbind(dataframe, as.factor(unlist(rep(as.list(paste(factornames[[j]],
                                                       labelnameslist[[j]],
                                                       sep="_")),
                                         each = n*prod(factor_levels)/prod(factor_levels[1:j]),
                                         times = prod(factor_levels)/prod(factor_levels[j:factors])
    ))))
  }
  #Rename the factor variables that were just created
  names(dataframe)[4:(3+factors)] <- factornames[1:factors]

  #Create subject column
  subject <- 1:n #Set subject to 1 to the number of subjects collected

  for(j2 in length(design_factors):1){ #for each factor in the design, from last to first
    #if w: repeat current string as often as the levels in the current factor (e.g., 3)
    #id b: repeat current string + max of current subject
    if(design_factors[j2] == 1){subject <- rep(subject,factor_levels[j2])}
    subject_length <- length(subject) #store current length - to append to string of this length below
    if(design_factors[j2] == 0){
      for(j3 in 2:factor_levels[j2]){
        subject <- append(subject,subject[1:subject_length]+max(subject))
      }
    }
  }

  #Overwrite subject columns in dataframe
  dataframe$subject <- subject
  #For the correlation matrix, we want the names of each possible comparison of means
  #Need to identify which columns from dataframe to pull the factor names from
  if (factors == 1) {
    cond_col <- c(4)
  } else if (factors == 2) {
    cond_col <- c(4, 5)
  } else {
    cond_col <- c(4, 5, 6)
  }

  dataframe$cond <- as.character(interaction(dataframe[, cond_col], sep = "_")) #create a new condition variable combine 2 columns (interaction is a cool function!)

  ###############
  # 5. Specify factors for formula ----
  ###############
  if(factors == 1 & sum(design_factors) == 1){frml1 <- as.formula(paste("y ~ ",factornames[1]," + Error(subject/",factornames[1],")",sep=""))}
  if(factors == 1 & sum(design_factors) == 0){frml1 <- as.formula(paste("y ~ ",factornames[1]," + Error(1 | subject)",sep=""))}

  if(factors == 2){
    if(sum(design_factors) == 2){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2]," + Error(subject/",factornames[1],"*",factornames[2],")"))}
    if(sum(design_factors) == 0){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2],"  + Error(1 | subject)"))}
    if(all(design_factors == c(1, 0)) == TRUE){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2]," + Error(subject/",factornames[1],")"))}
    if(all(design_factors == c(0, 1)) == TRUE){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2]," + Error(subject/",factornames[2],")"))}
  }

  if(factors == 3){
    if(sum(design_factors) == 3){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2],"*",factornames[3]," + Error(subject/",factornames[1],"*",factornames[2],"*",factornames[3],")"))}
    if(sum(design_factors) == 0){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2],"*",factornames[3]," + Error(1 | subject)"))}
    if(all(design_factors == c(1, 0, 0)) == TRUE){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2],"*",factornames[3]," + Error(subject/",factornames[1],")"))}
    if(all(design_factors == c(0, 1, 0)) == TRUE){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2],"*",factornames[3]," + Error(subject/",factornames[2],")"))}
    if(all(design_factors == c(0, 0, 1)) == TRUE){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2],"*",factornames[3]," + Error(subject/",factornames[3],")"))}
    if(all(design_factors == c(1, 1, 0)) == TRUE){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2],"*",factornames[3]," + Error(subject/",factornames[1],"*",factornames[2],")"))}
    if(all(design_factors == c(0, 1, 1)) == TRUE){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2],"*",factornames[3]," + Error(subject/",factornames[2],"*",factornames[3],")"))}
    if(all(design_factors == c(1, 0, 1)) == TRUE){frml1 <- as.formula(paste("y ~ ",factornames[1],"*",factornames[2],"*",factornames[3]," + Error(subject/",factornames[1],"*",factornames[3],")"))}
  }

  #Specify second formula used for plotting
  if(factors == 1){frml2 <- as.formula(paste("~",factornames[1]))}
  if(factors == 2){frml2 <- as.formula(paste("~",factornames[1],"+",factornames[2]))}
  if(factors == 3){frml2 <- as.formula(paste("~",factornames[1],"+",factornames[2],"+",factornames[3]))}

  ###############
  # 6. Create plot of means to visualize the design ----
  ###############

  dataframe_means <- data.frame(mu, sd)
  for(j in 1:factors){
    dataframe_means <- cbind(dataframe_means, as.factor(unlist(rep(as.list(paste(labelnameslist[[j]],
                                                                   sep="")),
                                                     each = prod(factor_levels)/prod(factor_levels[1:j]),
                                                     times = prod(factor_levels)/prod(factor_levels[j:factors])
    ))))
  }

  if(factors == 1){
    names(dataframe_means) <- c("mu","SD",factornames[1])
    dataframe_means[,factornames[1]] <- ordered(dataframe_means[,factornames[1]], levels = labelnameslist[[1]])
  }
  if(factors == 2){
    names(dataframe_means)<-c("mu","SD",factornames[1],factornames[2])
    dataframe_means[,factornames[1]] <- ordered(dataframe_means[,factornames[1]], levels = labelnameslist[[1]])
    dataframe_means[,factornames[2]] <- ordered(dataframe_means[,factornames[2]], levels = labelnameslist[[2]])
  }

  if(factors == 3) {
    names(dataframe_means) <- c("mu","SD",factornames[1],factornames[2],factornames[3])
    dataframe_means[,factornames[1]] <- ordered(dataframe_means[,factornames[1]], levels = labelnameslist[[1]])
    dataframe_means[,factornames[2]] <- ordered(dataframe_means[,factornames[2]], levels = labelnameslist[[2]])
    dataframe_means[,factornames[3]] <- ordered(dataframe_means[,factornames[3]], levels = labelnameslist[[3]])
  }

  if(factors == 1){meansplot = ggplot(dataframe_means, aes_string(y = "mu", x = factornames[1]))}
  if(factors == 2){meansplot = ggplot(dataframe_means, aes_string(y = "mu", x = factornames[1], colour = factornames[2]))}
  if(factors == 3){meansplot = ggplot(dataframe_means, aes_string(y = "mu", x = factornames[1], colour = factornames[2])) + facet_wrap(  paste("~",factornames[3],sep=""))}

  #Set custom color palette if factor 2 has a length greater than 8
  if (factors >= 2 && length(labelnameslist[[2]]) >= 9) {

    meansplot2 = meansplot +
      geom_point(position = position_dodge(width=0.9), shape = 10, size=5, stat="identity") + #Personal preference for sd -- ARC
      geom_errorbar(aes(ymin = mu-sd, ymax = mu+sd),
                    position = position_dodge(width=0.9), size=.6, width=.3) +
      coord_cartesian(ylim=c(min(mu)-sd, max(mu)+sd)) +
      theme_bw(base_size = 16) + ggtitle("Means for each condition in the design") +
     scale_colour_brewer(palette = "Dark2")

  } else {

    meansplot2 = meansplot +
      geom_point(position = position_dodge(width=0.9), shape = 10, size=5, stat="identity") + #Personal preference for sd -- ARC
      geom_errorbar(aes(ymin = mu-sd, ymax = mu+sd),
                    position = position_dodge(width=0.9), size=.6, width=.3) +
      coord_cartesian(ylim=c(min(mu)-sd, max(mu)+sd)) +
      theme_bw() + ggtitle("Means for each condition in the design") +
      scale_colour_brewer(palette = "Dark2")
  }
  if(plot == TRUE){
    print(meansplot2)  #should be blocked in Shiny context
  }

  # Return results in list()
  invisible(list(dataframe = dataframe,
                 design = design,
                 design_list = design_list,
                 factors = factors,
                 frml1 = frml1,
                 frml2 = frml2,
                 mu = mu,
                 sd = sd,
                 r = r,
                 n = n,
                 cor_mat = cor_mat,
                 sigmatrix = sigmatrix,
                 design_factors = design_factors,
                 labelnames = labelnames,
                 labelnameslist = labelnameslist,
                 factornames = factornames,
                 meansplot = meansplot2))
}