library(knitr) library(learnr) knitr::opts_chunk$set(echo = TRUE, eval = TRUE)
Perform PCA on USArrests
data.
warning("princomp doesn't work")
The first thing we need to do is to load the data.
data(USArrests)
Summarise the data frame.
summary(USArrests)
Plot the pairs.
plot(USArrests, pch = 19)
Plot boxplots of the data.
# boxplot(as.matrix(USArrests), col = "gold")
Compute the Principal Components.
pc.USArrests <- princomp(USArrests, scores = T) pc.USArrests
Inspect the loadings.
pc.USArrests <- princomp(USArrests, scores = T) load.rec <- pc.USArrests$loadings load.rec
We report them graphically.
par(mar = c(1, 4, 0, 2), mfrow = c(4, 1)) for (i in 1:4) barplot(load.rec[, i], ylim = c(-1, 1))
We now consider the explained variance. Try to plot the explained variance per principal component.
layout(matrix(c(2, 3, 1, 3), 2, byrow = T)) barplot(pc.USArrests$sdev^2, las = 2, main = "Principal components", ylim = c(0, 7e3), ylab = "Variances") abline(h = 1, col = "blue") barplot(sapply(USArrests, sd)^2, las = 2, main = "Original variables", ylim = c(0, 7e3), ylab = "Variances") abline(h = 1, col = "blue") abline(h = 0.8, lty = 2, col = "blue") pc.USArrests <- princomp(USArrests, scores = T) axis(2, at = 0:10 / 10, labels = 0:10 / 10) axis(1, at = 1:ncol(USArrests), labels = 1:ncol(USArrests), las = 2)
Compute the scores and visualise the first ones.
scores.USArrests <- pc.USArrests$scores head(scores.USArrests)
And plot the scores to get a visual grasp!
layout(matrix(c(1, 2), 2)) boxplot(USArrests, las = 2, col = "red", main = "Original variables") scores.USArrests <- data.frame(scores.USArrests) boxplot(scores.USArrests, las = 2, col = "red", main = "Scores")
Finally, make a biplot.
biplot(pc.USArrests, scale = 0, cex = .7)
We now consider the standardised data. Standardise the data.
USArrests.sd <- scale(USArrests) USArrests.sd <- data.frame(USArrests.sd)
It is always a good idea to start with some data visualisation. Let us make a boxplot!
boxplot(USArrests.sd, col = "gold")
Let us compute the PCA.
pc.USArrests <- princomp(USArrests.sd, scores = T)
Inspect the loadings.
load.rec <- pc.USArrests$loadings load.rec
We also wish to make a graphical representation of the loadings of the principal components.
par(mar = c(1, 4, 0, 2), mfrow = c(4, 1)) for (i in 1:4) barplot(load.rec[, i], ylim = c(-1, 1))
Let us plot the explained variance per principal component.
USArrests.sd <- scale(USArrests) layout(matrix(c(2, 3, 1, 3), 2, byrow = T)) barplot(pc.USArrests$sdev^2, las = 2, main = "Principal components", ylim = c(0, 4), ylab = "Variances") abline(h = 1, col = "blue") barplot(sapply(USArrests.sd, sd)^2, las = 2, main = "Original variables", ylim = c(0, 4), ylab = "Variances") plot(cumsum(pc.USArrests$sd^2) / sum(pc.USArrests$sd^2), type = "b", axes = F, xlab = "number of components", ylab = "contribution to the total variace", ylim = c(0, 1)) abline(h = 1, col = "blue") abline(h = 0.8, lty = 2, col = "blue") box() axis(2, at = 0:10 / 10, labels = 0:10 / 10) axis(1, at = 1:ncol(USArrests.sd), labels = 1:ncol(USArrests.sd), las = 2)
Let us inspect the scores and inspect the first few.
scores.USArrests <- pc.USArrests$scores head(scores.USArrests)
We now wish to visualise the dispersion of the original data and the dispersion of the scores.
layout(matrix(c(1, 2), 2)) boxplot(USArrests.sd, las = 2, col = "red", main = "Original variables") scores.USArrests <- data.frame(scores.USArrests) boxplot(scores.USArrests, las = 2, col = "red", main = "Scores")
To conclude, let us make a biplot.
biplot(pc.USArrests, scale = 0, cex = .7)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.