R/PCAplots.R

Defines functions pcaoutliers scree.boot triplot

Documented in pcaoutliers scree.boot triplot

#' PCA biplots with ggplot2
#'
#' @param fit             object types supported from this package: "PrincipalComp", "sdr", "gpls", "gpcr", "ldarob", "facanal". \cr \cr
#                         objects from other packages that are supported: "prcomp" (stats), "princomp" (stats), "principal" (psych), "PCA" (FactoMineR),
#'                        "Pca" (rrcov), "PcaRobust" (rrcov), "mvPCA" (MNM), "ics" (ICS), "factanal" (stats), "fa" (psych), "lda" (MASS)
#' @param choices         a vector of length 2 indicating which components to plot. defaults to the first two, ie c(1, 2).
#' @param scale           covariance biplot (scale = 1), form biplot (scale = 0). when scale = 1, the inner product between the variables approximates the covariance and the distance between the points approximates the Mahalanobis distance.
#' @param obs.scale       scale factor to apply to observations
#' @param var.scale       scale factor to apply to variables
#' @param pc.biplot       for compatibility with biplot.princomp()
#' @param coord_equal     should the coordinate system be set to maintain a symmetric shaped plot? defaults to FALSE.
#' @param groups          optional factor variable indicating the groups that the observations belong to. if provided the points will be colored according to groups
#' @param ellipse         draw a gaussian data ellipse for each group?
#' @param ellipse.prob    size of the ellipse in gaussian probability
#' @param kde             if TRUE, plots the kernel density estimate contours instead of points. useful for dense data. defaults to FALSE.
#' @param labels          optional vector of labels for the observations
#' @param labels.size     size of the text used for the labels
#' @param alpha           alpha transparency value for the points (0 = transparent, 1 = opaque)
#' @param circle          draw a correlation circle? (only applies when prcomp was called with scale = TRUE and when var.scale = 1)
#' @param var.axes        draw arrows for the variables?
#' @param var.names       should labels be drawn for the variable axes?
#' @param varname.size    size of the text for variable names
#' @param varname.adjust  adjustment factor the placement of the variable names, >= 1 means farther from the arrow
#' @param varname.abbrev  whether or not to abbreviate the variable names
#' @param lwd             the size of the variable axes. defaults to 1.25.
#' @param klwd            the thickness of the kde contours if kde=TRUE. defaults to 1.5.
#'
#' @return                a ggplot2 plot
#' @export
#' @examples
#'   data(wine)
#'   wine.pca <- PrincipalComp(wine, ncomp = 3)
#'   print(ggBiplot(wine.pca, groups = wine.class, ellipse = TRUE))
#'

ggBiplot = function (fit, choices = c(1, 2), groups = NULL, pc.biplot = T, coord_equal = F,
                     ellipse = F, ellipse.prob = 0.90, kde = F, circle = F, circle.prob = 0.90,
                     var.axes = T, var.names = T, varname.size = 4.125, varname.adjust = 1, varname.abbrev = T,
                     labels = NULL, labels.size = 3, scale = 1, obs.scale = 1 - scale, var.scale = scale, alpha = 0.75, lwd = 1.25, klwd = 1.5,...)
{
  suppressPackageStartupMessages(require(ggplot2))
  suppressPackageStartupMessages(require(plyr))
  suppressPackageStartupMessages(require(scales))
  suppressPackageStartupMessages(require(grid))

  gg_color <- function(n) {
    hues = seq(22, 355, length = n + 1)
    colorspace::darken(hcl(h = hues, l = 80, c = 100)[1:n], 0.40, space = "HCL")
  }

  stopifnot(length(choices) == 2)
  if (inherits(fit, "prcomp")) {
    nobs.factor <- sqrt(nrow(fit$x) - 1)
    d <- fit$sdev
    u <- sweep(fit$x, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$rotation
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "princomp")) {
    nobs.factor <- sqrt(fit$n.obs)
    d <- fit$sdev
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "PrincipalComp")) {
    if (attr(fit, "model")== "Sparse PCA" || attr(fit, "model") == "Robust Sparse PCA"){
      nobs.factor <- sqrt(nrow(fit$scores))
      d <- sqrt(fit$values)
      u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
      v <- fit$loadings
      v <- fit$loadings[-which(rowSums(v[,choices]) == 0), ]
      d.total <- sum(d^2)
    }
    else{
      nobs.factor <- sqrt(nrow(fit$scores))
      d <- sqrt(fit$values)
      u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
      v <- fit$loadings
      d.total <- sum(d^2)
    }
  }
  else if (inherits(fit, "mvPCA")) {
    nobs.factor <- sqrt(nrow(fit$scores))
    d <- sqrt(fit$EigenV)
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "ics")){
    nobs.factor <- sqrt(nrow(fit@Scores))
    d <- sqrt(fit@gKurt)
    u <- sweep(fit@Scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit@UnMix; rownames(v) <- fit@DataNames; colnames(v) <- colnames(fit@Scores)
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "PCA")) {
    nobs.factor <- sqrt(nrow(fit$call$X))
    d <- unlist(sqrt(fit$eig)[1])
    u <- sweep(fit$ind$coord, 2, 1/(d * nobs.factor), FUN = "*")
    v <- sweep(fit$var$coord, 2, sqrt(fit$eig[1:ncol(fit$var$coord), 1]), FUN = "/")
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "Pca")){
    nobs.factor <- sqrt(fit@n.obs)
    d <- sqrt(fit@eigenvalues)
    u <- sweep(fit@scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit@loadings
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "lda")) {
    nobs.factor <- sqrt(fit$N)
    d <- fit$svd
    u <- predict(fit)$x/nobs.factor
    v <- fit$scaling
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "ldarob")) {
    nobs.factor <- sqrt(fit$N)
    d <- fit$svd
    u <- predict(fit, type = "LD")/nobs.factor
    v <- fit$scaling
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "sdr")) {
    nobs.factor <- sqrt(length(fit$y.obs))
    d <- svd(fit$predictors)$d^0.50
    u <- sweep(fit$predictors, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$basis
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "gpls")) {
    nobs.factor <- sqrt(length(fit$y_obs))
    d <- svd(fit$factor.scores)$d^0.50
    u <- sweep(fit$factor.scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$factor.loadings
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "gpcr")) {
    nobs.factor <- sqrt(length(fit$linear.predictor))
    d <- svd(fit$scores)$d^0.50
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "factanal")) {
    nobs.factor <- sqrt(length(fit$n.obs))
    d <- svd(fit$scores)$d^0.50
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "facanal")) {
    nobs.factor <- sqrt(length(fit$n.obs))
    d <- svd(fit$scores)$d^0.50
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else if (any(class(fit) == "fa") && any(class(fit) == "psych")){
    nobs.factor <- sqrt(length(fit$n.obs))
    d <- sqrt(fit$e.values[1:ncol(fit$scores)])
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else if (any(class(fit) == "principal") && any(class(fit) == "psych")){
    nobs.factor <- sqrt(length(fit$n.obs))
    d <- sqrt(fit$values[1:ncol(fit$scores)])
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else {
    stop("No compatible class detected.")
  }

  choices <- pmin(choices, ncol(u))
  df.u <- as.data.frame(sweep(u[, choices], 2, d[choices]^obs.scale, FUN = "*"))
  v <- sweep(v, 2, d^var.scale, FUN = "*")
  df.v <- as.data.frame(v[, choices])
  names(df.u) <- c("xvar", "yvar")
  names(df.v) <- names(df.u)
  if (pc.biplot) {
    df.u <- df.u * nobs.factor
  }
  r <- sqrt(qchisq(circle.prob, df = 4)) * prod(colMeans(df.u^2))^(1/4)
  v.scale <- rowSums(v^2)
  df.v <- r * df.v/sqrt(max(v.scale))

  if (inherits(fit, "princomp") || inherits(fit, "PCA") || inherits(fit, "prcomp") || any(class(fit) == "principal") || inherits(fit, "mvPCA") || inherits(fit, "Pca")) {
    u.axis.labs <- paste0("Principal Component ", choices, sep = " ")
  }
  else if (inherits(fit, "PrincipalComp")){
    if (attr(fit, "model") == "Curvilinear PCA"){
      u.axis.labs <- paste0("Curvilinear Component ", choices, sep = " ")
    }
    else if (attr(fit, "model") == "Kernel PCA"){
      u.axis.labs <- paste0("Kernel Component ", choices, sep = " ")
    }
    else if (attr(fit, "model") == "Sample Dependent LPP"){
      u.axis.labs <- paste0("Dimension ", choices, sep = " ")
    }
    else{
      if (attr(fit, "rotation") != "none"){
        u.axis.labs <- paste0("Rotated Component ", choices, sep = " ")
      } else {
        u.axis.labs <- paste0("Principal Component ", choices, sep = " ")
      }
    }
  }
  else if (inherits(fit, "ics")){
    u.axis.labs <- paste0("Independent Component ", choices, sep = " ")
  }
  else if (inherits(fit, "lda") || inherits(fit, "ldarob")){
    u.axis.labs <- paste("Linear Discriminant ", choices, sep = " ")
  }
  else if (inherits(fit, "sdr")){
    u.axis.labs <- paste0("Sufficient Predictor ", choices, sep = " ")
  }
  else if (inherits(fit, "gpls")){
    u.axis.labs <- paste0("Factor ", choices, sep = " ")
  }
  else if (inherits(fit, "gpcr")){
    u.axis.labs <- paste0("Principal Component ", choices, sep = " ")
  }
  else if (inherits(fit, "factanal") || any(class(fit) == "fa") || class(fit) == "facanal"){
    u.axis.labs <- paste0("Factor ", choices, sep = " ")
  }
  if (!is.null(labels)) {
    df.u$labels <- labels
  }
  if (!is.null(groups)) {
    df.u$groups <- groups
  }
  if (varname.abbrev) {
    df.v$varname <- abbreviate(rownames(v), named = F, method = 'both.sides', minlength = 3)
  }
  else {
    df.v$varname <- rownames(v)
  }
  df.v$angle <- with(df.v, (180/pi) * atan(yvar/xvar))
  df.v$hjust = with(df.v, (1 - varname.adjust * sign(xvar))/2)
  g <- ggplot(data = df.u, aes(x = xvar, y = yvar)) +
    xlab(u.axis.labs[1]) +
    ylab(u.axis.labs[2]) +
    expand_limits(x = grDevices::extendrange(r = range(df.u$xvar), f = 0.05),
                  y = grDevices::extendrange(r = range(df.u$yvar), f = 0.05))
  if (coord_equal){
    g <- g + coord_equal()
  }

  ## Options for Kernel Density Estimates are desired in lieu of points
  if (kde) {
    if (!is.null(df.u$groups)) {
      g <- g + stat_density_2d(aes(color = groups), alpha = max(0.20, alpha * 0.9125), size = klwd)
    }
    else {
      g <- g + stat_density_2d(alpha = max(0.20, alpha * 0.85), color = "#26ad41", size = klwd)
    }
  }
  else {
    if (!is.null(df.u$groups)) {
      g <- g + geom_point(aes(color = groups, fill = groups), alpha = max(0.10, alpha * .65), shape = 21, size = 2.5) +
        scale_fill_manual(values = gg_color(length(unique(df.u$groups))), aesthetics = "fill")
    }
    else {
      g <- g + geom_point(alpha = max(0.10, alpha * 0.95), color = "#d99800", fill = "#e09a0d", shape = 21,  size = 2.5)
    }
  }

  ## Options for when variable axes (arrows) are enabled as well
  ## as when the circle is wanted without the arrows
  if (var.axes) {
    if (circle) {
      theta <- c(seq(-pi, pi, length = 75), seq(pi, -pi,length = 75))
      circle <- data.frame(xvar = r * cos(theta), yvar = r *sin(theta))
      g <- g + geom_path(data = circle, color = "#001412", size = 0.50, alpha = 0.285)
    }
    g <- g + geom_segment(data = df.v,
                          aes(x = 0,
                              y = 0,
                              xend = xvar * 0.95,
                              yend = yvar * 0.95,
                              alpha = 0.75 * alpha),
                          arrow = arrow(length = unit(2/3, "picas")),
                          size = lwd,
                          color = "#1063ff",
                          alpha = 0.75 * alpha)
  }
  else if (circle && !var.axes){
      theta <- c(seq(-pi, pi, length = 100), seq(pi, -pi,length = 100))
      circle <- data.frame(xvar = r * cos(theta), yvar = r *sin(theta))
      g <- g + geom_path(data = circle, color = "#a81c07", size = 0.60, alpha = 0.286)
  }


  ## Labels for Individual Observations
  if (!is.null(df.u$labels)) {
    if (!is.null(df.u$groups)) {
      g <- g + geom_text(aes(label = labels, color = groups), size = labels.size)
    }
    else {
      g <- g + geom_text(aes(label = labels), size = labels.size)
    }
  }

  ## Ellipse options when there are group IDs supplied
  if (!is.null(df.u$groups) && ellipse) {
    theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
    circle <- cbind(cos(theta), sin(theta))
    ell <- ddply(df.u, "groups", function(x) {
      if (nrow(x) <= 2) {
        return(NULL)
      }
      sigma <- var(cbind(x$xvar, x$yvar))
      mu <- c(mean(x$xvar), mean(x$yvar))
      ed <- sqrt(qchisq(ellipse.prob, df = 2))
      data.frame(sweep(circle %*% chol(sigma) * ed, 2,
                       mu, FUN = "+"), groups = x$groups[1])
    })
    names(ell)[1:2] <- c("xvar", "yvar")
    g <- g + geom_path(data = ell, aes(color = groups, group = groups))
  }

  ## Add variable axis labels last so that they are readable
  if (var.axes && var.names) {
    g <- g + geom_text(data = df.v,
                       aes(label = varname,
                           x = xvar, y = yvar,
                           angle = angle,
                           hjust = hjust,
                           fontface = "bold"),
                       color = "#eb009c",
                       size = varname.size,
                       alpha = 0.85)
  }
  return(g)
}

#' triplot: 3D biplots for PCA and more
#'
#' @param fit the model fit
#' @param choices the components to plot
#' @param groups optional group labels
#' @param scale  covariance biplot (scale = 1), form biplot (scale = 0). when scale = 1, the inner product between the variables approximates the covariance and the distance between the points approximates the Mahalanobis distance.
#' @param obs.scale scale factor to apply to observations
#' @param var.scale scale factor to apply to variables
#' @param var.axes should loading arrows be drawn? defaults to TRUE.
#' @param var.labels should the arrows be labeled? defaults to TRUE.
#' @param pc.biplot for compatibility with biplot.princomp()
#' @param ellipse draw a gaussian data ellipse for each group?
#' @param ellipse.prob size of the ellipse in gaussian probability
#' @param shape one of "p" for 2 dimensional points, or "s" for spheres. if left as NULL, "s" is used for smaller sample sizes (N < 250)
#' and "p" is used otherwise to ease computational load on computers without a dedicated GPU.
#' @param size a vector of length 2 of sizes for the data. defaults to 1 for shperes and 4 for points. if only one number is provided, be sure to select which shape it goes with and don't leave shape as NULL.
#'
#' @return an rgl plot
#' @export
#'
triplot = function(fit, choices = c(1,2,3), groups = NULL, ellipse = F, ellipse.prob = 0.95, scale = 1, pc.biplot = T,
                   var.axes = T, var.names = T, obs.scale = 1 - scale, var.scale = scale, shape = NULL, size = c(s=1, p=4)){


  gg_color <- function(n) {
    hues = seq(14, 356, length = n + 1)
    colorspace::darken(hcl(h = hues, l = 60, c = 120)[1:n], 0.10, space = "HCL")
  }

  get_colors <- function(groups, group.col = palette()){
    groups <- as.factor(groups)
    ngrps <- length(levels(groups))
    if(ngrps > length(group.col))
      group.col <- rep(group.col, ngrps)
    color <- group.col[as.numeric(groups)]
    names(color) <- as.vector(groups)
    return(color)
  }


  if (inherits(fit, "prcomp")) {
    nobs.factor <- sqrt(nrow(fit$x) - 1)
    d <- fit$sdev
    u <- sweep(fit$x, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$rotation
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "princomp")) {
    nobs.factor <- sqrt(fit$n.obs)
    d <- fit$sdev
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "PrincipalComp")) {
    if (attr(fit, "model")== "Sparse PCA" || attr(fit, "model") == "Robust Sparse PCA"){
      nobs.factor <- sqrt(nrow(fit$scores))
      d <- sqrt(fit$values)
      u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
      v <- fit$loadings
      v <- fit$loadings[-which(rowSums(v[,choices]) == 0), ]
      d.total <- sum(d^2)
    }
    else{
      nobs.factor <- sqrt(nrow(fit$scores))
      d <- sqrt(fit$values)
      u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
      v <- fit$loadings
      d.total <- sum(d^2)
    }
  }
  else if (inherits(fit, "mvPCA")) {
    nobs.factor <- sqrt(nrow(fit$scores))
    d <- sqrt(fit$EigenV)
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "PCA")) {
    nobs.factor <- sqrt(nrow(fit$call$X))
    d <- unlist(sqrt(fit$eig)[1])
    u <- sweep(fit$ind$coord, 2, 1/(d * nobs.factor), FUN = "*")
    v <- sweep(fit$var$coord, 2, sqrt(fit$eig[1:ncol(fit$var$coord), 1]), FUN = "/")
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "Pca")){
    nobs.factor <- sqrt(fit@n.obs)
    d <- sqrt(fit@eigenvalues)
    u <- sweep(fit@scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit@loadings
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "ics")){
    nobs.factor <- sqrt(nrow(fit@Scores))
    d <- sqrt(fit@gKurt)
    u <- sweep(fit@Scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit@UnMix
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "lda")) {
    nobs.factor <- sqrt(fit$N)
    d <- fit$svd
    u <- predict(fit)$x/nobs.factor
    v <- fit$scaling
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "ldarob")) {
    nobs.factor <- sqrt(fit$N)
    d <- fit$svd
    u <- predict(fit, type = "LD")/nobs.factor
    v <- fit$scaling
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "sdr")) {
    nobs.factor <- sqrt(length(fit$y.obs))
    d <- svd(fit$predictors)$d^0.50
    u <- sweep(fit$predictors, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$basis
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "gpls")) {
    nobs.factor <- sqrt(length(fit$y_obs))
    d <- svd(fit$factor.scores)$d^0.50
    u <- sweep(fit$factor.scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$factor.loadings
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "gpcr")) {
    nobs.factor <- sqrt(length(fit$linear.predictor))
    d <- svd(fit$scores)$d^0.50
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else if (inherits(fit, "factanal") || class(fit) == "facanal") {
    nobs.factor <- sqrt(length(fit$n.obs))
    d <- svd(fit$scores)$d^0.50
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else if (any(class(fit) == "fa") && any(class(fit) == "psych")){
    nobs.factor <- sqrt(length(fit$n.obs))
    d <- sqrt(fit$e.values[1:ncol(fit$scores)])
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else if (any(class(fit) == "principal") && any(class(fit) == "psych")){
    nobs.factor <- sqrt(length(fit$n.obs))
    d <- sqrt(fit$values[1:ncol(fit$scores)])
    u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
    v <- fit$loadings
    d.total <- sum(d^2)
  }
  else {
    stop("No compatible class detected.")
  }

  choices <- pmin(choices, ncol(u))
  df.u <- as.data.frame(sweep(u[, choices], 2, d[choices]^obs.scale, FUN = "*"))
  v <- sweep(v, 2, d^var.scale, FUN = "*")
  df.v <- as.data.frame(v[, choices])
  if (pc.biplot) {
    df.u <- df.u * nobs.factor
  }
  r <- sqrt(qchisq(0.95, df = 4)) * prod(colMeans(df.u^2))^(1/4)
  v.scale <- rowSums(v^2)
  df.v <- r * df.v/sqrt(max(v.scale))
  scores <- df.u
  rotation <- df.v
  rownames(rotation) <- abbreviate(rownames(rotation), named = F, method = 'both.sides', minlength = 2)
  if (is.null(shape)) {
    if (nobs.factor < 251){
      shape = "s"
      size = 1
      alpha = 1
    } else {
      shape = "p"
      size = 4
      alpha = 0.95
    }
  }
  else if (!is.null(shape)){
    if (length(size) == 2){
      size <- switch(shape,
                     "s" = size[1],
                     "p" = size[2])
      alpha = 0.60
    }
    else{
      size <- size
      alpha = 0.60
    }
  }

  if (is.null(groups)){
    colors = "#837287"
  }
  else {
    colors = get_colors(groups, gg_color(length(unique(groups))))
  }

  rgl::plot3d(scores, col = colors, type = shape, size = size, alpha = alpha)

  if (ellipse == TRUE) {
    if (is.null(groups)){
      covmat <- list()
        covmat$cov <- cov(scores)
      covmat$center <- cvreg::colMedians(scores)
      rgl::wire3d(rgl::ellipse3d(x = as.matrix(covmat$cov), centre=c(covmat$center), smooth = F, level = ellipse.prob), col = "#7dada8", alpha = 0.35, add = TRUE)
    }
    else{
      grps <- as.numeric(groups)
      grplbls <- unique(grps)
      colors <- gg_color(length(grplbls))

      for (i in 1:length(grplbls)){
        covmat <- MASS::cov.mve(scores[which(grps == i), choices], nsamp = 1500)
        rgl::wire3d(rgl::ellipse3d(x = as.matrix(covmat$cov), centre=c(covmat$center), smooth = F, level = ellipse.prob), alpha = 0.35, col = colors[i], add = TRUE)
      }
    }
  }
  if (var.axes){
    coords = NULL
    for (i in 1:nrow(rotation)){
      coords = rbind(coords, rbind(c(0,0,0), rotation[i, choices] * (abs(rotation[i, choices])^0.50/abs(rotation[i, choices]))))
    }
    rgl::lines3d(coords, col= "#2816f0", lwd = 3.125, alpha = 0.70)

    if (var.names){
      rgl::text3d((rotation[, choices] * (abs(rotation[, choices])^0.50/abs(rotation[, choices]))) , texts = rownames(rotation), col = "#ff0090", cex= 1.0925, font = 2)
    }
  }
}

#' Generate a Screeplot with Bias Corrected Bootstrap Confidence Intervals
#'
#' @param data a data frame or matrix of numeric variables
#' @param level the confidence level. defaults to 0.95.
#' @param nsamp number of bootstrap samples. defaults to 500.
#' @param method should the eigenvalues come from the correlation or
#' covariance matrix? one of "cor" (the default) or "cov"
#' @param parallel whether multiple cores should be used for the boostrap
#' analysis. defaults to TRUE.
#' @param ncores the number of cores to utilize for parallel processing.
#' defaults to 3.
#'
#'
#' @return a plot
#' @export
#'
#' @examples
#' scree.boot(diabetes[,-2])
#'
scree.boot <- function(data, level = 0.95, nsamp = 500, method = c("cor", "cov"), parallel = TRUE, ncores = 3){

  method <- match.arg(method)
  if (method=="cor"){
    values = eigen(cor(data))$values
    boot.fun <- function(data){
      kern <- c("epan", "tri", "optco", "biweight")
      kern <- kern[sample(1:4, 1)]
      eigen(cor(rmvk(nrow(data), y = data, kernel = kern, shrinked = F)),T,T)$values
    }
  }
  else{
    values = eigen(cov(data))$values
    boot.fun <- function(data){
      kern <- c("epan", "tri", "optco", "biweight")
      kern <- kern[sample(1:4, 1)]
      eigen(cov(rmvk(nrow(data), y = data, kernel = kern, shrinked = T)), T, T)$values
    }
  }

  suppressPackageStartupMessages(require(parallel))
  suppressPackageStartupMessages(require(doParallel))
  suppressPackageStartupMessages(require(foreach))
  if (parallel){
    if (is.null(ncores)) {ncores <- parallel::detectCores()}
    cl <- parallel::makeCluster(ncores)
    doParallel::registerDoParallel(cl)
    boot = foreach(i = 1:nsamp, .packages = "kernelboot", .combine = rbind) %dopar%
      boot.fun(data)
    parallel::stopCluster(cl)
  }
  else{
    boot = sapply(1:nsamp, function(n) boot.fun(data))
    boot = t(boot)
  }

  cis = apply(boot, 2, function(i) bci(i, conf.level = level)); cis = t(cis)
  l1med <- L1median(boot)$center
  plot(1:length(values), values, type = "b", xlab = "index",
       ylab = "eigenvalue", lwd = 1.75, col = "#1b241d",
       ylim = c(min(c(values, min(boot), 0)), max(c(values, max(boot))))
       )
  points(1:length(values), cis[,1], pch = "-", col = "#e03b07E6", cex = 1.125)
  points(1:length(values), cis[,2], pch = "+", col = "#03949cE6", cex = 1.125)
  points(1:length(values), l1med, pch = "*", col = "#857487E6", cex = 1.5)
  lines(1:length(values), rep(1, length(values)), col = "#a8a7a7CC", lty = 2)
  lines(1:length(values), rep(0.0001, length(values)), col = "#7a002bCC", lty = 2)
}


#' Outlier plot for PrincipalComp and facanal objects
#'
#' @param fit a PrincipalComp object (any of the PCA methods in this package,
#' however, pcaKern and pca Local do not produce the distances needed for making
#' this plot due to their inherently different nature).
#' @param plot defaults to TRUE. if FALSE it returns a list of vectors identifying
#' orthogonal outliers and leverage points.
#' @param interact if TRUE you can identify the data point. defaults to FALSE.
#'
#' @return a plot or a list
#' @export
#'
pcaoutliers <- function(fit, plot = T, score3d = T, interact = F){

  if (class(fit) != "PrincipalComp" && class(fit) != "facanal"){
    return(cat(cvreg:::.red2("Please supply a model of class PrincipalComp or facanal")))
  }
  if (attr(fit, "model")=="Kernel PCA"){
    return(cat(cvreg:::.hotpink("Kernel PCA is not supported")))
  }
  if (attr(fit, "model")=="Local PCA"){
    return(cat(cvreg:::.hotpink("Local PCA is not supported")))
  }
  ncomp <- ncol(fit$scores)
  nobs <- nrow(fit$scores)
  orthdist <- fit$orthdist
  scoredist <- fit$scoredist
  if (attr(fit, "model") == "PCA" ||  attr(fit, "model") == "Probabilistic PCA" || attr(fit, "model") == "Factor Analysis (EM-Algorithm)" || attr(fit, "model") == "Alpha Factor Analysis"){
    od.thresh <- (mean(orthdist^0.666) + sd(orthdist^0.666) * qnorm(0.975))^(1.5)
  }
  else{
    od.thresh <- (hdmedian(orthdist^0.666) + aad(orthdist^0.666) * qnorm(0.975))^(1.5)
  }

  sd.thresh <- sqrt(qchisq(0.972, ncomp))
  vec<-c(1:nrow(fit$scores))
  chkod<-ifelse(orthdist>od.thresh,1,0)
  chksd<-ifelse(scoredist>sd.thresh,1,0)
  chklevout <- chkod + chksd
  idlev<-vec[chksd==1]
  idout<-vec[chkod==1]
  idlevout<-vec[chklevout==2]

  if (plot || score3d){
    df=list(lev=chksd,out=chkod,lev.out=chkod)
    df$lev.out<-rep("clean", nobs)
    for (i in 1:nobs){
      if(sum(df$lev[i],df$out[i])==2){df$lev.out[i] <- "lev.out"}
      if(sum(df$lev[i],df$out[i])==1 && df$lev[i] == 1){df$lev.out[i] <- "lev"}
      if(sum(df$lev[i],df$out[i])==1 && df$out[i] == 1){df$lev.out[i] <- "orthout"}
    }
  }
  if(plot){
    plot(orthdist,scoredist,xlab="Orthogonal Distance",
         ylab="Score Distance",
         col="#ffffffCC",
         ylim=c(min(scoredist), max(c(sd.thresh * 1.1, max(scoredist)))),
         xlim=c(0, max(orthdist))
    )

    points(orthdist[df$lev.out=="lev.out"], scoredist[df$lev.out=="lev.out"], pch = 25, col = "#3b0000D9", bg= "#FF2344F2")
    points(orthdist[df$lev.out=="lev"], scoredist[df$lev.out=="lev"], pch = 23, col = "#590058D9", bg = "#580087CC")
    points(orthdist[df$lev.out=="orthout"], scoredist[df$lev.out=="orthout"], pch = 24, col = "#8c5200E6", bg = "#e39c0eF2")
    points(orthdist[df$lev.out=="clean"], scoredist[df$lev.out=="clean"], pch = 21, col = "#094f43D9", bg = "#2CBCE0B9")

    abline(v=od.thresh,col="#450e0eCC",lwd=2)
    abline(h=sd.thresh,col="#450e0eCC",lwd=2)
    if(interact) {identify(orthdist,scoredist)}
  }
  if (score3d && ncomp >= 3){
    cols <- c("#FF2344", "#580087", "#e39c0e", "#2CBCE0")
    levels <- as.numeric(factor(df$lev.out, levels = c("lev.out", "lev", "orthout", "clean")))
    colors <- cols[levels]
    rgl::plot3d(fit$scores[,1:3], col = colors, type = "s", size = 1.25, alpha = 1,
                xlab = colnames(fit$scores)[1], ylab = colnames(fit$scores)[2],
                zlab = colnames(fit$scores)[3])
  }
  else{
    list(good.lev = setdiff(idlev, idlevout), orthog.out = idout, lev.out = idlevout)
  }
}
abnormally-distributed/cvreg documentation built on May 3, 2020, 3:45 p.m.