#' PCA biplots with ggplot2
#'
#' @param fit object types supported from this package: "PrincipalComp", "sdr", "gpls", "gpcr", "ldarob", "facanal". \cr \cr
#' objects from other packages that are supported: "prcomp" (stats), "princomp" (stats), "principal" (psych), "PCA" (FactoMineR),
#' "Pca" (rrcov), "PcaRobust" (rrcov), "mvPCA" (MNM), "ics" (ICS), "factanal" (stats), "fa" (psych), "lda" (MASS)
#' @param choices a vector of length 2 indicating which components to plot. defaults to the first two, i.e., c(1, 2).
#' @param scale covariance biplot (scale = 1), form biplot (scale = 0). when scale = 1, the inner product between the variables approximates the covariance and the distance between the points approximates the Mahalanobis distance.
#' @param obs.scale scale factor to apply to observations
#' @param var.scale scale factor to apply to variables
#' @param pc.biplot for compatibility with biplot.princomp()
#' @param coord_equal should the coordinate system be set to maintain a symmetric shaped plot? defaults to FALSE.
#' @param groups optional factor variable indicating the groups that the observations belong to. if provided, the points will be colored according to group.
#' @param ellipse draw a gaussian data ellipse for each group?
#' @param ellipse.prob size of the ellipse in gaussian probability
#' @param kde if TRUE, plots the kernel density estimate contours instead of points. useful for dense data. defaults to FALSE.
#' @param labels optional vector of labels for the observations
#' @param labels.size size of the text used for the labels
#' @param alpha alpha transparency value for the points (0 = transparent, 1 = opaque)
#' @param circle draw a correlation circle? (only applies when prcomp was called with scale = TRUE and when var.scale = 1)
#' @param circle.prob size of the correlation circle in gaussian probability. defaults to 0.90.
#' @param var.axes draw arrows for the variables?
#' @param var.names should labels be drawn for the variable axes?
#' @param varname.size size of the text for variable names
#' @param varname.adjust adjustment factor for the placement of the variable names; values >= 1 move them farther from the arrow
#' @param varname.abbrev whether or not to abbreviate the variable names
#' @param lwd the line width of the variable axis arrows. defaults to 1.25.
#' @param klwd the thickness of the kde contours if kde=TRUE. defaults to 1.5.
#'
#' @return a ggplot2 plot
#' @export
#' @examples
#' data(wine)
#' wine.pca <- PrincipalComp(wine, ncomp = 3)
#' print(ggBiplot(wine.pca, groups = wine.class, ellipse = TRUE))
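#'
#' # a minimal additional sketch using a base-R fit; this assumes only the
#' # "prcomp" support path documented above (USArrests ships with R):
#' usa.pca <- prcomp(USArrests, scale. = TRUE)
#' print(ggBiplot(usa.pca, circle = TRUE))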
#'
ggBiplot = function (fit, choices = c(1, 2), groups = NULL, pc.biplot = TRUE, coord_equal = FALSE,
ellipse = FALSE, ellipse.prob = 0.90, kde = FALSE, circle = FALSE, circle.prob = 0.90,
var.axes = TRUE, var.names = TRUE, varname.size = 4.125, varname.adjust = 1, varname.abbrev = TRUE,
labels = NULL, labels.size = 3, scale = 1, obs.scale = 1 - scale, var.scale = scale, alpha = 0.75, lwd = 1.25, klwd = 1.5, ...)
{
suppressPackageStartupMessages(require(ggplot2))
suppressPackageStartupMessages(require(plyr))
suppressPackageStartupMessages(require(scales))
suppressPackageStartupMessages(require(grid))
gg_color <- function(n) {
hues = seq(22, 355, length = n + 1)
colorspace::darken(hcl(h = hues, l = 80, c = 100)[1:n], 0.40, space = "HCL")
}
stopifnot(length(choices) == 2)
if (inherits(fit, "prcomp")) {
nobs.factor <- sqrt(nrow(fit$x) - 1)
d <- fit$sdev
u <- sweep(fit$x, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$rotation
d.total <- sum(d^2)
}
else if (inherits(fit, "princomp")) {
nobs.factor <- sqrt(fit$n.obs)
d <- fit$sdev
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else if (inherits(fit, "PrincipalComp")) {
if (attr(fit, "model") == "Sparse PCA" || attr(fit, "model") == "Robust Sparse PCA"){
nobs.factor <- sqrt(nrow(fit$scores))
d <- sqrt(fit$values)
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
## drop only the all-zero loading rows; subsetting with -which() on an
## empty match would otherwise drop every row
zero.rows <- which(rowSums(v[, choices]) == 0)
if (length(zero.rows) > 0) v <- v[-zero.rows, , drop = FALSE]
d.total <- sum(d^2)
}
else{
nobs.factor <- sqrt(nrow(fit$scores))
d <- sqrt(fit$values)
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
}
else if (inherits(fit, "mvPCA")) {
nobs.factor <- sqrt(nrow(fit$scores))
d <- sqrt(fit$EigenV)
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else if (inherits(fit, "ics")){
nobs.factor <- sqrt(nrow(fit@Scores))
d <- sqrt(fit@gKurt)
u <- sweep(fit@Scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit@UnMix; rownames(v) <- fit@DataNames; colnames(v) <- colnames(fit@Scores)
d.total <- sum(d^2)
}
else if (inherits(fit, "PCA")) {
nobs.factor <- sqrt(nrow(fit$call$X))
d <- unlist(sqrt(fit$eig)[1])
u <- sweep(fit$ind$coord, 2, 1/(d * nobs.factor), FUN = "*")
v <- sweep(fit$var$coord, 2, sqrt(fit$eig[1:ncol(fit$var$coord), 1]), FUN = "/")
d.total <- sum(d^2)
}
else if (inherits(fit, "Pca")){
nobs.factor <- sqrt(fit@n.obs)
d <- sqrt(fit@eigenvalues)
u <- sweep(fit@scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit@loadings
d.total <- sum(d^2)
}
else if (inherits(fit, "lda")) {
nobs.factor <- sqrt(fit$N)
d <- fit$svd
u <- predict(fit)$x/nobs.factor
v <- fit$scaling
d.total <- sum(d^2)
}
else if (inherits(fit, "ldarob")) {
nobs.factor <- sqrt(fit$N)
d <- fit$svd
u <- predict(fit, type = "LD")/nobs.factor
v <- fit$scaling
d.total <- sum(d^2)
}
else if (inherits(fit, "sdr")) {
nobs.factor <- sqrt(length(fit$y.obs))
d <- svd(fit$predictors)$d^0.50
u <- sweep(fit$predictors, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$basis
d.total <- sum(d^2)
}
else if (inherits(fit, "gpls")) {
nobs.factor <- sqrt(length(fit$y_obs))
d <- svd(fit$factor.scores)$d^0.50
u <- sweep(fit$factor.scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$factor.loadings
d.total <- sum(d^2)
}
else if (inherits(fit, "gpcr")) {
nobs.factor <- sqrt(length(fit$linear.predictor))
d <- svd(fit$scores)$d^0.50
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else if (inherits(fit, "factanal")) {
nobs.factor <- sqrt(fit$n.obs)
d <- svd(fit$scores)$d^0.50
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else if (inherits(fit, "facanal")) {
nobs.factor <- sqrt(fit$n.obs)
d <- svd(fit$scores)$d^0.50
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else if (any(class(fit) == "fa") && any(class(fit) == "psych")){
nobs.factor <- sqrt(fit$n.obs)
d <- sqrt(fit$e.values[1:ncol(fit$scores)])
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else if (any(class(fit) == "principal") && any(class(fit) == "psych")){
nobs.factor <- sqrt(fit$n.obs)
d <- sqrt(fit$values[1:ncol(fit$scores)])
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else {
stop("No compatible class detected.")
}
choices <- pmin(choices, ncol(u))
df.u <- as.data.frame(sweep(u[, choices], 2, d[choices]^obs.scale, FUN = "*"))
v <- sweep(v, 2, d^var.scale, FUN = "*")
df.v <- as.data.frame(v[, choices])
names(df.u) <- c("xvar", "yvar")
names(df.v) <- names(df.u)
if (pc.biplot) {
df.u <- df.u * nobs.factor
}
r <- sqrt(qchisq(circle.prob, df = 4)) * prod(colMeans(df.u^2))^(1/4)
v.scale <- rowSums(v^2)
df.v <- r * df.v/sqrt(max(v.scale))
if (inherits(fit, "princomp") || inherits(fit, "PCA") || inherits(fit, "prcomp") || any(class(fit) == "principal") || inherits(fit, "mvPCA") || inherits(fit, "Pca")) {
u.axis.labs <- paste0("Principal Component ", choices)
}
else if (inherits(fit, "PrincipalComp")){
if (attr(fit, "model") == "Curvilinear PCA"){
u.axis.labs <- paste0("Curvilinear Component ", choices)
}
else if (attr(fit, "model") == "Kernel PCA"){
u.axis.labs <- paste0("Kernel Component ", choices)
}
else if (attr(fit, "model") == "Sample Dependent LPP"){
u.axis.labs <- paste0("Dimension ", choices)
}
else{
if (attr(fit, "rotation") != "none"){
u.axis.labs <- paste0("Rotated Component ", choices)
} else {
u.axis.labs <- paste0("Principal Component ", choices)
}
}
}
else if (inherits(fit, "ics")){
u.axis.labs <- paste0("Independent Component ", choices)
}
else if (inherits(fit, "lda") || inherits(fit, "ldarob")){
u.axis.labs <- paste0("Linear Discriminant ", choices)
}
else if (inherits(fit, "sdr")){
u.axis.labs <- paste0("Sufficient Predictor ", choices)
}
else if (inherits(fit, "gpls")){
u.axis.labs <- paste0("Factor ", choices)
}
else if (inherits(fit, "gpcr")){
u.axis.labs <- paste0("Principal Component ", choices)
}
else if (inherits(fit, "factanal") || any(class(fit) == "fa") || inherits(fit, "facanal")){
u.axis.labs <- paste0("Factor ", choices)
}
if (!is.null(labels)) {
df.u$labels <- labels
}
if (!is.null(groups)) {
df.u$groups <- groups
}
if (varname.abbrev) {
df.v$varname <- abbreviate(rownames(v), named = F, method = 'both.sides', minlength = 3)
}
else {
df.v$varname <- rownames(v)
}
df.v$angle <- with(df.v, (180/pi) * atan(yvar/xvar))
df.v$hjust = with(df.v, (1 - varname.adjust * sign(xvar))/2)
g <- ggplot(data = df.u, aes(x = xvar, y = yvar)) +
xlab(u.axis.labs[1]) +
ylab(u.axis.labs[2]) +
expand_limits(x = grDevices::extendrange(r = range(df.u$xvar), f = 0.05),
y = grDevices::extendrange(r = range(df.u$yvar), f = 0.05))
if (coord_equal){
g <- g + coord_equal()
}
## Options for Kernel Density Estimates are desired in lieu of points
if (kde) {
if (!is.null(df.u$groups)) {
g <- g + stat_density_2d(aes(color = groups), alpha = max(0.20, alpha * 0.9125), size = klwd)
}
else {
g <- g + stat_density_2d(alpha = max(0.20, alpha * 0.85), color = "#26ad41", size = klwd)
}
}
else {
if (!is.null(df.u$groups)) {
g <- g + geom_point(aes(color = groups, fill = groups), alpha = max(0.10, alpha * .65), shape = 21, size = 2.5) +
scale_fill_manual(values = gg_color(length(unique(df.u$groups))), aesthetics = "fill")
}
else {
g <- g + geom_point(alpha = max(0.10, alpha * 0.95), color = "#d99800", fill = "#e09a0d", shape = 21, size = 2.5)
}
}
## Options for when variable axes (arrows) are enabled as well
## as when the circle is wanted without the arrows
if (var.axes) {
if (circle) {
theta <- c(seq(-pi, pi, length = 75), seq(pi, -pi,length = 75))
circle <- data.frame(xvar = r * cos(theta), yvar = r *sin(theta))
g <- g + geom_path(data = circle, color = "#001412", size = 0.50, alpha = 0.285)
}
g <- g + geom_segment(data = df.v,
aes(x = 0,
y = 0,
xend = xvar * 0.95,
yend = yvar * 0.95),
arrow = arrow(length = unit(2/3, "picas")),
size = lwd,
color = "#1063ff",
alpha = 0.75 * alpha)
}
else if (circle && !var.axes){
theta <- c(seq(-pi, pi, length = 100), seq(pi, -pi,length = 100))
circle <- data.frame(xvar = r * cos(theta), yvar = r *sin(theta))
g <- g + geom_path(data = circle, color = "#a81c07", size = 0.60, alpha = 0.286)
}
## Labels for Individual Observations
if (!is.null(df.u$labels)) {
if (!is.null(df.u$groups)) {
g <- g + geom_text(aes(label = labels, color = groups), size = labels.size)
}
else {
g <- g + geom_text(aes(label = labels), size = labels.size)
}
}
## Ellipse options when there are group IDs supplied
if (!is.null(df.u$groups) && ellipse) {
theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
circle <- cbind(cos(theta), sin(theta))
ell <- ddply(df.u, "groups", function(x) {
if (nrow(x) <= 2) {
return(NULL)
}
sigma <- var(cbind(x$xvar, x$yvar))
mu <- c(mean(x$xvar), mean(x$yvar))
ed <- sqrt(qchisq(ellipse.prob, df = 2))
data.frame(sweep(circle %*% chol(sigma) * ed, 2,
mu, FUN = "+"), groups = x$groups[1])
})
names(ell)[1:2] <- c("xvar", "yvar")
g <- g + geom_path(data = ell, aes(color = groups, group = groups))
}
## Add variable axis labels last so that they are readable
if (var.axes && var.names) {
g <- g + geom_text(data = df.v,
aes(label = varname,
x = xvar, y = yvar,
angle = angle,
hjust = hjust),
color = "#eb009c",
fontface = "bold",
size = varname.size,
alpha = 0.85)
}
return(g)
}
#' triplot: 3D biplots for PCA and more
#'
#' @param fit the model fit
#' @param choices the components to plot
#' @param groups optional group labels
#' @param scale covariance biplot (scale = 1), form biplot (scale = 0). when scale = 1, the inner product between the variables approximates the covariance and the distance between the points approximates the Mahalanobis distance.
#' @param obs.scale scale factor to apply to observations
#' @param var.scale scale factor to apply to variables
#' @param var.axes should loading arrows be drawn? defaults to TRUE.
#' @param var.names should the arrows be labeled? defaults to TRUE.
#' @param pc.biplot for compatibility with biplot.princomp()
#' @param ellipse draw a gaussian data ellipse for each group?
#' @param ellipse.prob size of the ellipse in gaussian probability
#' @param shape one of "p" for 2 dimensional points, or "s" for spheres. if left as NULL, "s" is used for smaller sample sizes (N < 250)
#' and "p" is used otherwise to ease the computational load on computers without a dedicated GPU.
#' @param size a vector of length 2 of sizes for the data. defaults to 1 for spheres and 4 for points. if only one number is provided, be sure to select which shape it goes with and don't leave shape as NULL.
#'
#' @return an rgl plot
#' @export
#'
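#' @examples
#' # a minimal sketch via the "prcomp" support path; assumes an rgl-capable
#' # device (shape is set explicitly so the default size pair is used):
#' iris.pca <- prcomp(iris[, 1:4], scale. = TRUE)
#' triplot(iris.pca, groups = iris$Species, ellipse = TRUE, shape = "p")
#'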
triplot = function(fit, choices = c(1,2,3), groups = NULL, ellipse = FALSE, ellipse.prob = 0.95, scale = 1, pc.biplot = TRUE,
var.axes = TRUE, var.names = TRUE, obs.scale = 1 - scale, var.scale = scale, shape = NULL, size = c(s = 1, p = 4)){
gg_color <- function(n) {
hues = seq(14, 356, length = n + 1)
colorspace::darken(hcl(h = hues, l = 60, c = 120)[1:n], 0.10, space = "HCL")
}
get_colors <- function(groups, group.col = palette()){
groups <- as.factor(groups)
ngrps <- length(levels(groups))
if(ngrps > length(group.col))
group.col <- rep(group.col, ngrps)
color <- group.col[as.numeric(groups)]
names(color) <- as.vector(groups)
return(color)
}
if (inherits(fit, "prcomp")) {
nobs.factor <- sqrt(nrow(fit$x) - 1)
d <- fit$sdev
u <- sweep(fit$x, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$rotation
d.total <- sum(d^2)
}
else if (inherits(fit, "princomp")) {
nobs.factor <- sqrt(fit$n.obs)
d <- fit$sdev
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else if (inherits(fit, "PrincipalComp")) {
if (attr(fit, "model") == "Sparse PCA" || attr(fit, "model") == "Robust Sparse PCA"){
nobs.factor <- sqrt(nrow(fit$scores))
d <- sqrt(fit$values)
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
## drop only the all-zero loading rows; subsetting with -which() on an
## empty match would otherwise drop every row
zero.rows <- which(rowSums(v[, choices]) == 0)
if (length(zero.rows) > 0) v <- v[-zero.rows, , drop = FALSE]
d.total <- sum(d^2)
}
else{
nobs.factor <- sqrt(nrow(fit$scores))
d <- sqrt(fit$values)
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
}
else if (inherits(fit, "mvPCA")) {
nobs.factor <- sqrt(nrow(fit$scores))
d <- sqrt(fit$EigenV)
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else if (inherits(fit, "PCA")) {
nobs.factor <- sqrt(nrow(fit$call$X))
d <- unlist(sqrt(fit$eig)[1])
u <- sweep(fit$ind$coord, 2, 1/(d * nobs.factor), FUN = "*")
v <- sweep(fit$var$coord, 2, sqrt(fit$eig[1:ncol(fit$var$coord), 1]), FUN = "/")
d.total <- sum(d^2)
}
else if (inherits(fit, "Pca")){
nobs.factor <- sqrt(fit@n.obs)
d <- sqrt(fit@eigenvalues)
u <- sweep(fit@scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit@loadings
d.total <- sum(d^2)
}
else if (inherits(fit, "ics")){
nobs.factor <- sqrt(nrow(fit@Scores))
d <- sqrt(fit@gKurt)
u <- sweep(fit@Scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit@UnMix
d.total <- sum(d^2)
}
else if (inherits(fit, "lda")) {
nobs.factor <- sqrt(fit$N)
d <- fit$svd
u <- predict(fit)$x/nobs.factor
v <- fit$scaling
d.total <- sum(d^2)
}
else if (inherits(fit, "ldarob")) {
nobs.factor <- sqrt(fit$N)
d <- fit$svd
u <- predict(fit, type = "LD")/nobs.factor
v <- fit$scaling
d.total <- sum(d^2)
}
else if (inherits(fit, "sdr")) {
nobs.factor <- sqrt(length(fit$y.obs))
d <- svd(fit$predictors)$d^0.50
u <- sweep(fit$predictors, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$basis
d.total <- sum(d^2)
}
else if (inherits(fit, "gpls")) {
nobs.factor <- sqrt(length(fit$y_obs))
d <- svd(fit$factor.scores)$d^0.50
u <- sweep(fit$factor.scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$factor.loadings
d.total <- sum(d^2)
}
else if (inherits(fit, "gpcr")) {
nobs.factor <- sqrt(length(fit$linear.predictor))
d <- svd(fit$scores)$d^0.50
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else if (inherits(fit, "factanal") || inherits(fit, "facanal")) {
nobs.factor <- sqrt(fit$n.obs)
d <- svd(fit$scores)$d^0.50
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else if (any(class(fit) == "fa") && any(class(fit) == "psych")){
nobs.factor <- sqrt(fit$n.obs)
d <- sqrt(fit$e.values[1:ncol(fit$scores)])
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else if (any(class(fit) == "principal") && any(class(fit) == "psych")){
nobs.factor <- sqrt(fit$n.obs)
d <- sqrt(fit$values[1:ncol(fit$scores)])
u <- sweep(fit$scores, 2, 1/(d * nobs.factor), FUN = "*")
v <- fit$loadings
d.total <- sum(d^2)
}
else {
stop("No compatible class detected.")
}
choices <- pmin(choices, ncol(u))
df.u <- as.data.frame(sweep(u[, choices], 2, d[choices]^obs.scale, FUN = "*"))
v <- sweep(v, 2, d^var.scale, FUN = "*")
df.v <- as.data.frame(v[, choices])
if (pc.biplot) {
df.u <- df.u * nobs.factor
}
r <- sqrt(qchisq(0.95, df = 4)) * prod(colMeans(df.u^2))^(1/4)
v.scale <- rowSums(v^2)
df.v <- r * df.v/sqrt(max(v.scale))
scores <- df.u
rotation <- df.v
rownames(rotation) <- abbreviate(rownames(rotation), named = F, method = 'both.sides', minlength = 2)
if (is.null(shape)) {
## nobs.factor is sqrt(N), so compare its square against the documented
## N < 250 cutoff for spheres vs. points
if (nobs.factor^2 < 251){
shape = "s"
size = 1
alpha = 1
} else {
shape = "p"
size = 4
alpha = 0.95
}
}
else {
if (length(size) == 2){
size <- switch(shape,
"s" = size[1],
"p" = size[2])
}
alpha = 0.60
}
if (is.null(groups)){
colors = "#837287"
}
else {
colors = get_colors(groups, gg_color(length(unique(groups))))
}
rgl::plot3d(scores, col = colors, type = shape, size = size, alpha = alpha)
if (ellipse) {
if (is.null(groups)){
covmat <- list()
covmat$cov <- cov(scores)
covmat$center <- cvreg::colMedians(scores)
rgl::wire3d(rgl::ellipse3d(x = as.matrix(covmat$cov), centre=c(covmat$center), smooth = F, level = ellipse.prob), col = "#7dada8", alpha = 0.35, add = TRUE)
}
else{
grps <- as.numeric(groups)
grplbls <- unique(grps)
colors <- gg_color(length(grplbls))
for (i in 1:length(grplbls)){
covmat <- MASS::cov.mve(scores[which(grps == i), choices], nsamp = 1500)
rgl::wire3d(rgl::ellipse3d(x = as.matrix(covmat$cov), centre=c(covmat$center), smooth = F, level = ellipse.prob), alpha = 0.35, col = colors[i], add = TRUE)
}
}
}
if (var.axes){
coords = NULL
for (i in 1:nrow(rotation)){
coords = rbind(coords, rbind(c(0,0,0), rotation[i, choices] * (abs(rotation[i, choices])^0.50/abs(rotation[i, choices]))))
}
rgl::lines3d(coords, col= "#2816f0", lwd = 3.125, alpha = 0.70)
if (var.names){
rgl::text3d((rotation[, choices] * (abs(rotation[, choices])^0.50/abs(rotation[, choices]))) , texts = rownames(rotation), col = "#ff0090", cex= 1.0925, font = 2)
}
}
}
#' Generate a Screeplot with Bias Corrected Bootstrap Confidence Intervals
#'
#' @param data a data frame or matrix of numeric variables
#' @param level the confidence level. defaults to 0.95.
#' @param nsamp number of bootstrap samples. defaults to 500.
#' @param method should the eigenvalues come from the correlation or
#' covariance matrix? one of "cor" (the default) or "cov"
#' @param parallel whether multiple cores should be used for the bootstrap
#' analysis. defaults to TRUE.
#' @param ncores the number of cores to utilize for parallel processing.
#' defaults to 3.
#'
#'
#' @return a plot
#' @export
#'
#' @examples
#' scree.boot(diabetes[,-2])
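#'
#' # a covariance-based variant run serially (a sketch assuming the same
#' # diabetes data as above):
#' scree.boot(diabetes[,-2], method = "cov", parallel = FALSE)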
#'
scree.boot <- function(data, level = 0.95, nsamp = 500, method = c("cor", "cov"), parallel = TRUE, ncores = 3){
method <- match.arg(method)
if (method=="cor"){
values = eigen(cor(data))$values
boot.fun <- function(data){
kern <- c("epan", "tri", "optco", "biweight")
kern <- kern[sample(1:4, 1)]
eigen(cor(rmvk(nrow(data), y = data, kernel = kern, shrinked = F)),T,T)$values
}
}
else{
values = eigen(cov(data))$values
boot.fun <- function(data){
kern <- c("epan", "tri", "optco", "biweight")
kern <- kern[sample(1:4, 1)]
eigen(cov(rmvk(nrow(data), y = data, kernel = kern, shrinked = T)), T, T)$values
}
}
suppressPackageStartupMessages(require(parallel))
suppressPackageStartupMessages(require(doParallel))
suppressPackageStartupMessages(require(foreach))
if (parallel){
if (is.null(ncores)) {ncores <- parallel::detectCores()}
cl <- parallel::makeCluster(ncores)
doParallel::registerDoParallel(cl)
boot = foreach(i = 1:nsamp, .packages = "kernelboot", .combine = rbind) %dopar%
boot.fun(data)
parallel::stopCluster(cl)
}
else{
boot = sapply(1:nsamp, function(n) boot.fun(data))
boot = t(boot)
}
cis = apply(boot, 2, function(i) bci(i, conf.level = level)); cis = t(cis)
l1med <- L1median(boot)$center
plot(1:length(values), values, type = "b", xlab = "index",
ylab = "eigenvalue", lwd = 1.75, col = "#1b241d",
ylim = c(min(c(values, min(boot), 0)), max(c(values, max(boot))))
)
points(1:length(values), cis[,1], pch = "-", col = "#e03b07E6", cex = 1.125)
points(1:length(values), cis[,2], pch = "+", col = "#03949cE6", cex = 1.125)
points(1:length(values), l1med, pch = "*", col = "#857487E6", cex = 1.5)
lines(1:length(values), rep(1, length(values)), col = "#a8a7a7CC", lty = 2)
lines(1:length(values), rep(0.0001, length(values)), col = "#7a002bCC", lty = 2)
}
#' Outlier plot for PrincipalComp and facanal objects
#'
#' @param fit a PrincipalComp object (any of the PCA methods in this package;
#' however, pcaKern and pcaLocal do not produce the distances needed for making
#' this plot due to their inherently different nature).
#' @param plot defaults to TRUE. if FALSE it returns a list of vectors identifying
#' orthogonal outliers and leverage points.
#' @param score3d if TRUE and at least three components were fit, the first three
#' component scores are displayed in an rgl plot with outliers highlighted. defaults to TRUE.
#' @param interact if TRUE you can identify the data points interactively. defaults to FALSE.
#'
#' @return a plot or a list
#' @export
#'
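#' @examples
#' # a minimal sketch assuming the wine data and PrincipalComp() fit used in
#' # the ggBiplot examples (a plain "PCA" model supplies the orthdist and
#' # scoredist components this plot needs):
#' wine.pca <- PrincipalComp(wine, ncomp = 3)
#' pcaoutliers(wine.pca, plot = TRUE, score3d = FALSE)
#'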
pcaoutliers <- function(fit, plot = TRUE, score3d = TRUE, interact = FALSE){
if (!inherits(fit, "PrincipalComp") && !inherits(fit, "facanal")){
return(cat(cvreg:::.red2("Please supply a model of class PrincipalComp or facanal")))
}
if (attr(fit, "model")=="Kernel PCA"){
return(cat(cvreg:::.hotpink("Kernel PCA is not supported")))
}
if (attr(fit, "model")=="Local PCA"){
return(cat(cvreg:::.hotpink("Local PCA is not supported")))
}
ncomp <- ncol(fit$scores)
nobs <- nrow(fit$scores)
orthdist <- fit$orthdist
scoredist <- fit$scoredist
if (attr(fit, "model") == "PCA" || attr(fit, "model") == "Probabilistic PCA" || attr(fit, "model") == "Factor Analysis (EM-Algorithm)" || attr(fit, "model") == "Alpha Factor Analysis"){
od.thresh <- (mean(orthdist^0.666) + sd(orthdist^0.666) * qnorm(0.975))^(1.5)
}
else{
od.thresh <- (hdmedian(orthdist^0.666) + aad(orthdist^0.666) * qnorm(0.975))^(1.5)
}
sd.thresh <- sqrt(qchisq(0.972, ncomp))
vec<-c(1:nrow(fit$scores))
chkod<-ifelse(orthdist>od.thresh,1,0)
chksd<-ifelse(scoredist>sd.thresh,1,0)
chklevout <- chkod + chksd
idlev<-vec[chksd==1]
idout<-vec[chkod==1]
idlevout<-vec[chklevout==2]
if (plot || score3d){
df <- list(lev = chksd, out = chkod)
df$lev.out<-rep("clean", nobs)
for (i in 1:nobs){
if(sum(df$lev[i],df$out[i])==2){df$lev.out[i] <- "lev.out"}
if(sum(df$lev[i],df$out[i])==1 && df$lev[i] == 1){df$lev.out[i] <- "lev"}
if(sum(df$lev[i],df$out[i])==1 && df$out[i] == 1){df$lev.out[i] <- "orthout"}
}
}
if(plot){
plot(orthdist,scoredist,xlab="Orthogonal Distance",
ylab="Score Distance",
col="#ffffffCC",
ylim=c(min(scoredist), max(c(sd.thresh * 1.1, max(scoredist)))),
xlim=c(0, max(orthdist))
)
points(orthdist[df$lev.out=="lev.out"], scoredist[df$lev.out=="lev.out"], pch = 25, col = "#3b0000D9", bg= "#FF2344F2")
points(orthdist[df$lev.out=="lev"], scoredist[df$lev.out=="lev"], pch = 23, col = "#590058D9", bg = "#580087CC")
points(orthdist[df$lev.out=="orthout"], scoredist[df$lev.out=="orthout"], pch = 24, col = "#8c5200E6", bg = "#e39c0eF2")
points(orthdist[df$lev.out=="clean"], scoredist[df$lev.out=="clean"], pch = 21, col = "#094f43D9", bg = "#2CBCE0B9")
abline(v=od.thresh,col="#450e0eCC",lwd=2)
abline(h=sd.thresh,col="#450e0eCC",lwd=2)
if(interact) {identify(orthdist,scoredist)}
}
if (score3d && ncomp >= 3){
cols <- c("#FF2344", "#580087", "#e39c0e", "#2CBCE0")
levels <- as.numeric(factor(df$lev.out, levels = c("lev.out", "lev", "orthout", "clean")))
colors <- cols[levels]
rgl::plot3d(fit$scores[,1:3], col = colors, type = "s", size = 1.25, alpha = 1,
xlab = colnames(fit$scores)[1], ylab = colnames(fit$scores)[2],
zlab = colnames(fit$scores)[3])
}
## return the outlier/leverage index list whenever plot = FALSE, as documented
if (!plot){
return(list(good.lev = setdiff(idlev, idlevout), orthog.out = idout, lev.out = idlevout))
}
}