# R/GenerateData.R

#' Generate a multilevel long-form data frame
#'
#' @description Builds a long-form data set with \code{n} observations drawn
#'   from \code{j} units.
#'
#' @details The returned data frame contains the variables \code{id},
#'   \code{y}, and one column \code{V#} per fixed effect, where \code{V2} is
#'   the constant intercept column. The data are generated in three steps.
#'   First the level 2 data, which are constant within a unit, are generated.
#'   Next the order in which the units are observed is drawn with
#'   \code{sample} using \code{replace = TRUE}; this results in an unequal
#'   number of observations per unit, although each unit has an equal
#'   probability of being sampled. Lastly the level 1 covariates and the
#'   residuals are drawn from the normal distribution.
#'
#'   The first element of \code{random_coef_sd} belongs to the random
#'   intercept. Every further element adds a random slope for the next level 1
#'   covariate, in order, so \code{random_coef_sd} can hold at most one element
#'   more than there are level 1 covariates.
#'
#' @param n Number of observations.
#' @param j Number of units.
#' @param fixed_coef Vector with fixed effects coefficients, ordered as
#'   intercept, level 2 covariates, level 1 covariates.
#' @param random_coef_sd Vector with standard deviations of the random
#'   effects; the first element is the standard deviation of the random
#'   intercept.
#' @param resid_sd Scalar, residual standard deviation.
#' @param n_level_2_var Number of level 2 variables, counting the intercept.
#' @param mean_fixed_level_2 Means of the fixed effects covariates, level 2.
#'   A single value is used for every level 2 covariate.
#' @param sd_fixed_level_2 Standard deviations of the fixed effects
#'   covariates, level 2. A single value is used for every level 2 covariate.
#' @param mean_fixed_level_1 Means of the fixed effects covariates, level 1.
#'   A single value is used for every level 1 covariate.
#' @param sd_fixed_level_1 Standard deviations of the fixed effects
#'   covariates, level 1. A single value is used for every level 1 covariate.
#' @return A data frame with the variable \code{id}, which labels the units,
#'   the outcome \code{y}, and the fixed effects covariates \code{V2},
#'   \code{V3}, ...
#' @keywords  multilevel dataset
#' @export
#' @examples
#' ## We create a dataset, consisting of 2500 observations from 20
#' ## units. The fixed effects have the coefficients 1, 2, 3, 4, and 5. The
#' ## variance of the random effects equals 1, 4, and 9. Lastly the
#' ## residual variance equals 4:
#'
#' test_data <- build_dataset(n = 2500,
#'                            j = 20,
#'                            fixed_coef = 1:5,
#'                            random_coef_sd = 1:3,
#'                            resid_sd = 2)
build_dataset <- function(n,
                          j,
                          fixed_coef,
                          random_coef_sd,
                          resid_sd,
                          n_level_2_var = 2,
                          mean_fixed_level_2 = 0,
                          sd_fixed_level_2 = 1,
                          mean_fixed_level_1 = 0,
                          sd_fixed_level_1 = 1) {
  n_fixed_var     <- length(fixed_coef)
  n_random_var    <- length(random_coef_sd)
  n_level_1       <- n_fixed_var - n_level_2_var
  n_random_slopes <- n_random_var - 1

  # Reject argument combinations that cannot form a design matrix. Without
  # these checks the element-wise product further down would silently recycle
  # mismatched operands and return a wrong outcome.
  if (n_level_2_var < 1) {
    stop("`n_level_2_var` must be at least 1 (the intercept).",
         call. = FALSE)
  }
  if (n_level_1 < 0) {
    stop("`fixed_coef` must have at least `n_level_2_var` elements.",
         call. = FALSE)
  }
  if (n_random_var < 1) {
    stop("`random_coef_sd` must contain at least the random intercept SD.",
         call. = FALSE)
  }
  if (n_random_slopes > n_level_1) {
    stop("`random_coef_sd` asks for more random slopes than there are ",
         "level 1 covariates.", call. = FALSE)
  }

  # A length-one mean or SD applies to every covariate of its level; mean and
  # SD are expanded independently of each other.
  expand_scalar <- function(x, len) {
    if (length(x) == 1) rep(x, len) else x
  }
  mean_fixed_level_2 <- expand_scalar(mean_fixed_level_2, n_level_2_var - 1)
  sd_fixed_level_2   <- expand_scalar(sd_fixed_level_2, n_level_2_var - 1)
  mean_fixed_level_1 <- expand_scalar(mean_fixed_level_1, n_level_1)
  sd_fixed_level_1   <- expand_scalar(sd_fixed_level_1, n_level_1)

  # Step 1: unit-level quantities, one row per unit. The fixed coefficients
  # are repeated for every unit and sit next to the unit's random effects.
  level_2_data_j <- build_person_data(j,
                                      n_level_2_var,
                                      mean_fixed_level_2,
                                      sd_fixed_level_2)
  coef_data      <- cbind(matrix(rep(fixed_coef, each = j), nrow = j),
                          build_coef_data(j, random_coef_sd))

  # Step 2: decide which unit produces each observation and expand the
  # unit-level rows accordingly. `drop = FALSE` keeps matrices when n == 1.
  ids          <- sample(seq_len(j), n, replace = TRUE)
  coef_dataset <- coef_data[ids, , drop = FALSE]
  fixed_design <- level_2_data_j[ids, -1, drop = FALSE]

  # Step 3: observation-level covariates.
  level_1_data <- matrix(NA_real_, nrow = n, ncol = n_level_1)
  for (t in seq_len(n_level_1)) {
    level_1_data[, t] <- stats::rnorm(n,
                                      mean_fixed_level_1[t],
                                      sd_fixed_level_1[t])
  }
  fixed_design <- cbind(fixed_design, level_1_data)

  # Random-effects design: a constant for the random intercept followed by
  # the first `n_random_slopes` level 1 covariates. `seq_len()` yields no
  # columns for a random-intercept-only model, where `1:0` would have picked
  # up the first covariate by accident.
  random_design <- cbind(rep(1, n),
                         level_1_data[, seq_len(n_random_slopes),
                                      drop = FALSE])

  design <- cbind(fixed_design, random_design)
  y      <- rowSums(design * coef_dataset) + stats::rnorm(n, 0, resid_sd)

  dataset        <- as.data.frame(cbind(ids, y, fixed_design))
  names(dataset) <- c("id", "y", paste0("V", seq_len(n_fixed_var) + 1))
  dataset
}

#' build_person_data is a function to create level 2 data.
#'
#' The first column of the result holds the unit ids \code{1, ..., j}, the
#' second column is a constant 1 (the intercept), and every further column is
#' a level 2 covariate drawn from the normal distribution.
#'
#' @param j Number of units.
#' @param n_level_2_var Number of level 2 variables, counting the intercept;
#'   must be at least 1.
#' @param mean_fixed_level_2  A vector with means for the level 2 covariates.
#'   A single value is used for every covariate.
#' @param sd_fixed_level_2 A vector with standard deviations for the level 2
#'   covariates. A single value is used for every covariate.
#' @return A numeric matrix with \code{j} rows and \code{1 + n_level_2_var}
#'   columns holding the level 2 data for the units.
build_person_data <- function(j,
                              n_level_2_var,
                              mean_fixed_level_2,
                              sd_fixed_level_2) {
  if (n_level_2_var < 1) {
    stop("`n_level_2_var` must be at least 1 (the intercept).",
         call. = FALSE)
  }
  n_covariates <- n_level_2_var - 1

  # Expand length-one means/SDs so that indexing below never reads past the
  # end of the vector (which would yield NA and hence NaN draws).
  if (length(mean_fixed_level_2) == 1) {
    mean_fixed_level_2 <- rep(mean_fixed_level_2, n_covariates)
  }
  if (length(sd_fixed_level_2) == 1) {
    sd_fixed_level_2 <- rep(sd_fixed_level_2, n_covariates)
  }

  person_data        <- matrix(NA_real_, nrow = j, ncol = 1 + n_level_2_var)
  person_data[, 1:2] <- cbind(seq_len(j), 1)

  # `seq_len(0)` is empty, so an intercept-only request skips the loop.
  for (t in seq_len(n_covariates)) {
    person_data[, t + 2] <- stats::rnorm(j,
                                         mean_fixed_level_2[t],
                                         sd_fixed_level_2[t])
  }
  person_data
}

#' build_coef_data creates a matrix with random coefficients per unit.
#'
#' Every column is drawn from a normal distribution with mean 0 and the
#' corresponding standard deviation in \code{random_coef_sd}.
#'
#' @param j Number of units.
#' @param random_coef_sd A vector with the true standard deviations of the
#'   random effects.
#' @return A numeric matrix with \code{j} rows and one column per random
#'   effect, holding the random coefficients for all units. An empty
#'   \code{random_coef_sd} gives a matrix with zero columns.
build_coef_data <- function(j,
                            random_coef_sd) {
  n_random <- length(random_coef_sd)
  # One vectorised draw, filled column by column: the first j values use the
  # first SD, the next j the second, and so on. This is the same draw order
  # as looping over the columns, and it also copes with zero random effects.
  matrix(stats::rnorm(j * n_random, 0, rep(random_coef_sd, each = j)),
         nrow = j,
         ncol = n_random)
}
# L-Ippel/SEMA documentation built on May 30, 2019, 8:23 a.m.