R/EFAGenData.R
In RGenData: Generates Multivariate Nonnormal Data and Determines How Many Factors to Retain

Documented in EFACompData FactorAnalysis GenDataPopulation GenDataSample

GenDataPopulation <- function(supplied.data, n.factors, n.cases, max.trials = 5,
                              initial.multiplier = 1, corr.type = "pearson",
                              seed = 0 ) {
# Simulates multivariate nonnormal data using an iterative algorithm
#
# Args:
#   supplied.data      : Data supplied by user
#   n.factors          : Number of factors (scalar).
#   n.cases            : Number of cases (scalar).
#   max.trials         : Maximum number of trials (scalar).
#   initial.multiplier : Value of initial multiplier (scalar).
#   corr.type          : Type of correlation (character, default is "pearson",
#                        user can also call "spearman").
#   seed               : seed value (scalar, default is 0).
#
# Returns:
#   data : Population of data
#
  set.seed(seed)
  n.variables <- dim(supplied.data)[2]
  data <- matrix(0, nrow = n.cases, ncol = n.variables)
  distributions <- matrix(0, nrow = n.cases, ncol = n.variables)
  iteration <- 0
  best.rmsr <- 1
  trials.without.improvement <- 0
  for (i in 1:n.variables) {
    if (min(supplied.data[, i]) == max(supplied.data[, i]))
      supplied.data[, i] <- supplied.data[, i] + rnorm(dim(supplied.data)[1], 0,
                           .0001)
    v <- FALSE
    while (!v) {
      distributions[, i] <- sort(sample(supplied.data[, i], size = n.cases,
                                replace = TRUE))
      v <- min(distributions[, i]) != max(distributions[, i])
    }
  }
  target.corr <- cor(supplied.data, method = corr.type)
  intermediate.corr <- target.corr
  shared.comp <- matrix(rnorm(n.cases * n.factors, 0, 1), nrow = n.cases,
                 ncol = n.factors)
  unique.comp <- matrix(rnorm(n.cases * n.variables, 0, 1), nrow = n.cases,
                 ncol = n.variables)
  shared.load <- matrix(0, nrow = n.variables, ncol = n.factors)
  unique.load <- matrix(0, nrow = n.variables, ncol = 1)
  while (trials.without.improvement < max.trials) {
    iteration <- iteration + 1
    factor.analysis <- FactorAnalysis(intermediate.corr, corr.matrix = TRUE,
                       max.iteration = 50, n.factors, corr.type)
    if (n.factors == 1) {
      shared.load[, 1] <- factor.analysis$loadings
    } else {
        for (i in 1:n.factors)
          shared.load[, i] <- factor.analysis$loadings[, i]
    }
      shared.load[shared.load > 1] <- 1
      shared.load[shared.load < -1] <- -1
      if (shared.load[1, 1] < 0)
        shared.load <- shared.load * -1
      for (i in 1:n.variables)
        if (sum(shared.load[i, ] * shared.load[i, ]) < 1) {
          unique.load[i, 1] <- (1 - sum(shared.load[i, ] * shared.load[i, ]))
        } else {
          unique.load[i, 1] <- 0
        }
      unique.load <- sqrt(unique.load)
      for (i in 1:n.variables)
        data[, i] <- (shared.comp %*% t(shared.load))[, i] + unique.comp[, i] *
                     unique.load[i, 1]
      for (i in 1:n.variables) {
        data <- data[sort.list(data[, i]), ]
        data[, i] <- distributions[, i]
      }
      reproduced.corr <- cor(data, method = corr.type)
      residual.corr <- target.corr - reproduced.corr
      rmsr <- sqrt(sum(residual.corr[lower.tri(residual.corr)] *
        residual.corr[lower.tri(residual.corr)]) /
          (.5 * (n.variables * n.variables - n.variables)))
      if (rmsr < best.rmsr) {
        best.rmsr <- rmsr
        best.corr <- intermediate.corr
        best.res <- residual.corr
        intermediate.corr <- intermediate.corr + initial.multiplier *
                             residual.corr
        trials.without.improvement <- 0
      } else {
        trials.without.improvement <- trials.without.improvement + 1
        current.multiplier <- initial.multiplier *
                              .5 ^ trials.without.improvement
        intermediate.corr <- best.corr + current.multiplier * best.res
      }
   }
   factor.analysis <- FactorAnalysis(best.corr, corr.matrix = TRUE,
                                     max.iteration = 50, n.factors,
                                     corr.type)
   if (n.factors == 1) {
     shared.load[, 1] <- factor.analysis$loadings
   } else {
     for (i in 1:n.factors)
        shared.load[, i] <- factor.analysis$loadings[, i]
   }
   shared.load[shared.load > 1] <- 1
   shared.load[shared.load < -1] <- -1
   if (shared.load[1, 1] < 0)
     shared.load <- shared.load * -1
   for (i in 1:n.variables)
     if (sum(shared.load[i, ] * shared.load[i, ]) < 1) {
       unique.load[i, 1] <- (1 - sum(shared.load[i, ] * shared.load[i, ]))
     } else {
       unique.load[i, 1] <- 0
     }
   unique.load <- sqrt(unique.load)
   for (i in 1:n.variables)
     data[, i] <- (shared.comp %*% t(shared.load))[, i] + unique.comp[, i] *
                  unique.load[i, 1]
   data <- apply(data, 2, scale) # standardizes each variable in the matrix
   for (i in 1:n.variables) {
     data <- data[sort.list(data[, i]), ]
     data[, i] <- distributions[, i]
   }
   return(data)
}
################################################################################
GenDataSample <- function(supplied.data, n.factors = 0, max.trials = 5,
                          initial.multiplier = 1, corr.type = "pearson",
                          seed = 0) {
# Bootstraps each variable's score distribution from a supplied data set.
#
# Args:
#   supplied.data      : Data supplied by user.
#   n.factors          : Number of factors (scalar, default is 0).
#   max.trials         : Maximum number of trials (scalar, default is 5).
#   initial.multipiler : Value of initial multiplier (scalar, default is 1).
#   corr.type          : Type of correlation (character, default is "pearson",
#                        user can also call "spearman").
#   seed               : seed value (scalar, default is 0).
#
# Returns:
#   data : Sample of data

  set.seed(seed)
  n.cases <- dim(supplied.data)[1]
  n.variables <- dim(supplied.data)[2]
  data <- matrix(0, nrow = n.cases, ncol = n.variables)
  distribution.matrix <- matrix(0, nrow = n.cases, ncol = n.variables)
  iteration.count <- 0
  best.rmsr.corr <- 1
  trial.counter <- 0
  if (seed != 0)
    set.seed(seed) # If user specified a nonzero seed, set it
  for (i in 1:n.variables)
    distribution.matrix[, i] <- sort(sample(supplied.data[, i], n.cases,
    replace = TRUE))
    # Modify this block of the program, as needed, to generate the desired
    # distribution(s).
  target.correlation <- cor(supplied.data)
  intermediate.correlation <- target.correlation
  if (n.factors == 0) {
    eigenvalues.observed <- eigen(intermediate.correlation)$values
    eigenvalues.random <- matrix(0, nrow = 100, ncol = n.variables)
    random.data <- matrix(0, nrow = n.cases, ncol = n.variables)
    for (i in 1:100) {
      for (j in 1:n.variables)
        random.data[, j] <- sample(distribution.matrix[, j], size = n.cases,
        replace = TRUE)
      eigenvalues.random[i, ] <- eigen(cor(random.data))$values
    }
    eigenvalues.random <- apply(eigenvalues.random, 2, mean)
    # calculate mean eigenvalue for each factor
    n.factors <- max(1, sum(eigenvalues.observed > eigenvalues.random))
  }
  shared.components <- matrix(rnorm(n.cases * n.factors, 0, 1), nrow = n.cases,
                                    ncol = n.factors)
  unique.components <- matrix(rnorm(n.cases * n.variables, 0, 1), nrow = n.cases,
                                    ncol = n.variables)
  shared.load <- matrix(0, nrow = n.variables, ncol = n.factors)
  unique.load <- matrix(0, nrow = n.variables, ncol = 1)
  while (trial.counter < max.trials) {
    iteration.count <- iteration.count + 1
    factor.analysis <- FactorAnalysis(intermediate.correlation,
                                      corr.matrix = TRUE, max.iteration = 50,
                                      n.factors, corr.type)
    if (n.factors == 1) {
      shared.load[, 1] <- factor.analysis$loadings
    } else {
      shared.load <- factor.analysis$loadings
    }
    shared.load[shared.load > 1] <- 1
    shared.load[shared.load < -1] <- -1
    if (shared.load[1, 1] < 0)
      shared.load <- shared.load * -1
    shared.load.sq <- shared.load * shared.load
    for (i in 1:n.variables)
      if (sum(shared.load.sq[i, ]) < 1) {
        unique.load[i, 1] <- (1 - sum(shared.load.sq[i, ]))
      } else {
        unique.load[i, 1] <- 0
      }
    unique.load <- sqrt(unique.load)
    for (i in 1:n.variables)
      data[, i] <- (shared.components %*% t(shared.load))[, i] +
                          unique.components[, i] * unique.load[i, 1]
    for (i in 1:n.variables) {
      data <- data[sort.list(data[, i]), ]
      data[, i] <- distribution.matrix[, i]
    }
    reproduced.corr <- cor(data)
    residual.corr <- target.correlation - reproduced.corr
    rmsr <- sqrt(sum(residual.corr[lower.tri(residual.corr)] *
                     residual.corr[lower.tri(residual.corr)]) /
                     (.5 * (n.variables * n.variables - n.variables)))
    if (rmsr < best.rmsr.corr) {
      best.rmsr.corr <- rmsr
      best.corr <- intermediate.correlation
      best.res <- residual.corr
      intermediate.correlation <- intermediate.correlation +
                                  initial.multiplier * residual.corr
      trial.counter <- 0
    } else {
      trial.counter <- trial.counter + 1
      current.multiplier <- initial.multiplier * .5 ^ trial.counter
      intermediate.correlation <- best.corr + current.multiplier * best.res
    }
  }
  factor.analysis <- FactorAnalysis(best.corr, corr.matrix = TRUE,
                                    max.iteration = 50, n.factors,
                                    corr.type)
  if (n.factors == 1) {
    shared.load[, 1] <- factor.analysis$loadings
  } else {
    shared.load <- factor.analysis$loadings
  }
  shared.load[shared.load > 1] <- 1
  shared.load[shared.load < -1] <- -1
  if (shared.load[1, 1] < 0)
    shared.load <- shared.load * -1
  shared.load.sq <- shared.load * shared.load
  for (i in 1:n.variables)
    if (sum(shared.load.sq[i, ]) < 1) {
      unique.load[i, 1] <- (1 - sum(shared.load.sq[i, ]))
    } else {
      unique.load[i, 1] <- 0
  }
  unique.load <- sqrt(unique.load)
  for (i in 1:n.variables)
    data[, i] <- (shared.components %*% t(shared.load))[, i] +
                        unique.components[, i] * unique.load[i, 1]
  data <- apply(data, 2, scale)
  for (i in 1:n.variables) {
    data <- data[sort.list(data[, i]), ]
    data[, i] <- distribution.matrix[, i]
  }
  data <- data[sample(1:n.cases, n.cases, replace = FALSE), ]
  return(data)
}
################################################################################
FactorAnalysis <- function(data, corr.matrix = FALSE, max.iteration = 50,
                            n.factors = 0, corr.type = "pearson") {
# Analyzes comparison data with known factorial structures
#
# Args:
#   data          : Matrix to store the simulated data.
#   corr.matrix   : Correlation matrix (default is FALSE)
#   max.iteration : Maximum number of iterations (scalar, default is 50).
#   n.factors     : Number of factors (scalar, default is 0).
#   corr.type     : Type of correlation (character, default is "pearson",
#                   user can also call "spearman").
#
# Returns:
#   $loadings : Factor loadings (vector, if one factor. matrix, if multiple
#               factors)
#   $factors  : Number of factors (scalar).
#
  data <- as.matrix(data)
  n.variables <- dim(data)[2]
  if (n.factors == 0) {
    n.factors <- n.variables
    determine <- TRUE
  } else {
    determine <- FALSE
  }
  if (!corr.matrix) {
    corr.matrix <- cor(data, method = corr.type)
  } else {
    corr.matrix <- data
  }
  criterion <- .001
  old.h2 <- rep(99, n.variables)
  h2 <- rep(0, n.variables)
  change <- 1
  iteration <- 0
  factor.loadings <- matrix(nrow = n.variables, ncol = n.factors)
  while ((change >= criterion) & (iteration < max.iteration)) {
    iteration <- iteration + 1
    eigenvalue <- eigen(corr.matrix)
    l <- sqrt(eigenvalue$values[1:n.factors])
    for (i in 1:n.factors)
      factor.loadings[, i] <- eigenvalue$vectors[, i] * l[i]
    for (i in 1:n.variables)
      h2[i] <- sum(factor.loadings[i, ] * factor.loadings[i, ])
    change <- max(abs(old.h2 - h2))
    old.h2 <- h2
    diag(corr.matrix) <- h2
  }
  if (determine) n.factors <- sum(eigenvalue$values > 1)
  return(list(loadings = factor.loadings[, 1:n.factors],
              factors = n.factors))
}

################################################################################

EFACompData <- function(data, f.max, n.pop = 10000, n.samples = 500,
                        alpha = .30, graph = FALSE, corr.type = "pearson") {
# Comparison data
#
# Args:
#   data      : Matrix to store the simulated data (matrix).
#   f.max     : Largest number of factors to consider (scalar).
#   n.pop     : Size of finite populations of comparison data (scalar, default
#               is 10,000 cases).
#   n.samples : Number of samples drawn from each population (scalar, default is
#               500)
#   alpha     : Alpha level when testing statistical significance of improvement
#               with additional factor (scalar, default is .30)
#   graph     : Whether to plot the fit of eigenvalues to those for comparison
#               data (default is FALSE)
#   corr.type : Type of correlation (character, default is "pearson", user can
#               also call "spearman").
#
# Returns:
#   Nothing, displays number of factors on screen.
#
  n <- dim(data)[1]
  n.variables <- dim(data)[2]
  corr.data <- cor(data, method = corr.type)
  eigs.data <- eigen(corr.data)$values
  rmsr.eigs <- matrix(0, nrow = n.samples, ncol = f.max)
  sig <- TRUE
  f.cd <- 1
  while ((f.cd <= f.max) & (sig)) {
    pop <- GenDataPopulation (data, n.factors = f.cd, n.cases = n.pop,
                              corr.type = corr.type)
    for (j in 1:n.samples) {
      v <- 0
      while (v == 0) {
        samp <- pop[sample(1:n.pop, size = n, replace = T), ]
        for (i in 1:n.variables)
          v <- v + min(samp[, i]) != max(samp[, i])
      }
      corr.samp <- cor(samp, method = corr.type)
      eigs.samp <- eigen(corr.samp)$values
      rmsr.eigs[j, f.cd] <- sqrt(sum((eigs.samp - eigs.data) *
                                (eigs.samp - eigs.data)) / n.variables)
    }
    if (f.cd > 1)
      sig <- (wilcox.test(rmsr.eigs[, f.cd], rmsr.eigs[, (f.cd - 1)],
              "less")$p.value < alpha)
    if (sig)
      f.cd <- f.cd + 1
  }
  cat("Number of factors to retain: ", f.cd - 1, "\n")
  if (graph) {
    if (sig) {
      x.max <- f.cd - 1
    } else {
      x.max <- f.cd
    }
    ys <- apply(rmsr.eigs[, 1:x.max], 2, mean)
    plot(x = 1:x.max, y = ys, ylim = c(0, max(ys)), xlab = "Factor",
         ylab = "RMSR Eigenvalue", type = "b",
         main = "Fit to Comparison Data")
    abline(v = f.cd - 1, lty = 3)
  }
}

Any scripts or data that you put into this service are public.

RGenData documentation built on May 2, 2019, 2:47 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

RGenData
Generates Multivariate Nonnormal Data and Determines How Many Factors to Retain

R/EFAGenData.R
In RGenData: Generates Multivariate Nonnormal Data and Determines How Many Factors to Retain

Defines functions GenDataPopulation GenDataSample FactorAnalysis EFACompData

Documented in EFACompData FactorAnalysis GenDataPopulation GenDataSample

Try the RGenData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

RGenData Generates Multivariate Nonnormal Data and Determines How Many Factors to Retain

R/EFAGenData.R In RGenData: Generates Multivariate Nonnormal Data and Determines How Many Factors to Retain

Defines functions GenDataPopulation GenDataSample FactorAnalysis EFACompData

Documented in EFACompData FactorAnalysis GenDataPopulation GenDataSample

Try the RGenData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

RGenData
Generates Multivariate Nonnormal Data and Determines How Many Factors to Retain

R/EFAGenData.R
In RGenData: Generates Multivariate Nonnormal Data and Determines How Many Factors to Retain