inst/doc/Correspondence_analysis_examples.R

## ----echo = FALSE-------------------------------------------------------------
knitr::opts_chunk$set(
  fig.width = 6,
  fig.height = 6,
  fig.align ='center'
)

## -----------------------------------------------------------------------------
library(cellWise)

## -----------------------------------------------------------------------------
data("data_clothes")
X <- data_clothes
dim(X)
# create matrix S as in paper:
sum(X) # total count is 4373
P    <- X / sum(X) # relative frequencies that add up to 1
rvec <- rowSums(P) # vector of row totals
cvec <- colSums(P) # vector of column totals
R    <- X / rowSums(X) # row profiles
rowSums(R) # all 1,  OK
S <- diag(sqrt(rvec)) %*% (R - rep(1, nrow(X)) %*% 
                             t(cvec)) %*% diag(1/sqrt(cvec))
dimnames(S) <- dimnames(X)
round(S, 3)
d <- ncol(S)

# We verify that the points of S are on a hyperplane by construction:
(eigvals <- eigen(t(S) %*% S)$values)


## ----fig.height=8,fig.width=3-------------------------------------------------
DDC.out <- DDC(S)
ggp <- cellMap(R = DDC.out$stdResid,
               mTitle = "clothes data",
               rowtitle = "countries",
               columntitle = "brackets")

# pdf("clothes_cellmap.pdf", width = 3, height = 6)
plot(ggp)
# dev.off()

## -----------------------------------------------------------------------------
countryInds <- which(rownames(S) %in% 
                       c("GB", "SK", "MT", "GR", "BG", "LV", "RO"))
matplot(t(S[countryInds, ]), pch = 16, col = 1:7)
lines(apply(S, 2, median), lwd = 3) # median profile
# The profile of these countries deviate in a similar way: 
# they trade a lot of cheap clothes, and fewer expensive ones. 

## ----fig.height=6,fig.width=6-------------------------------------------------
svd.out <- svd(S)
svd.out$d# S is indeed singular:
(svd.out$d)^2 - eigen(t(S) %*% S)$values # ~0
diff(c(0,cumsum(svd.out$d^2)/sum(svd.out$d^2)))
# This can be shown in a scree plot.

# For plotting the rows:
rowproj <- diag(1/sqrt(rvec)) %*% svd.out$u %*% diag(svd.out$d)
# For plotting the columns = variables:
colproj <- diag(1/sqrt(cvec)) %*% svd.out$v %*% diag(svd.out$d)

# pdf(file="clothes_ClassCorrespA.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlab="", ylab="",
     xlim = c(-1, 0.6), ylim = c(-0.6, 0.6))
title(main="Classical correspondence analysis of clothes data", 
      line=1)
text(rowproj[, 1:2], labels = rownames(S))
abline(v=0); abline(h=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.93,0.85,0.78,0.84,0.87)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], 
              col = "red", arr.type="simple", arr.width=0.1,
              arr.length = 0.1) # , arr.adj = 0)
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
# dev.off()

## ----fig.height=6,fig.width=6-------------------------------------------------
# We apply MacroPCA with center fixed at zero.
# As in classical CA, we do not prescale S.
MacroPCApar0 <- list(scale = FALSE, center = rep(0,d))
MacroPCA.out <- MacroPCA(S, k=0, MacroPCApars = MacroPCApar0)

MacroPCA.out <- MacroPCA(S, k=2, MacroPCApars = MacroPCApar0)

(eigvals <- MacroPCA.out$eigenvalues)
diff(c(0,cumsum(eigvals)/sum(eigvals)))

# Compute the principal coordinates for the biplot.
# To make the plot match the orientation in Riani et al:
V <- -MacroPCA.out$loadings
Tscores <- -MacroPCA.out$scores
sVals <- sqrt(nrow(S)*MacroPCA.out$eigenvalues)
U <- Tscores %*% diag(1/sVals)
rowproj <- diag(1/sqrt(rvec)) %*% U %*% diag(sVals)
colproj <- diag(1/sqrt(cvec)) %*% V %*% diag(sVals)

# pdf(file="clothes_MacroCA_biplot.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlim=c(-0.95,0.65),
     ylim=c(-0.6,0.6), xlab="", ylab="")
title(main="Cellwise robust correspondence analysis of clothes data", 
      line=1)
text(rowproj[,1:2], labels = rownames(S))
abline(h=0); abline(v=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.9,0.8,0.5,0.75,0.85)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], 
              col = "red", arr.type="simple", arr.width=0.1,
              arr.length = 0.1)
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
# dev.off()
# Matches Fig 4 of Riani quite well.

## -----------------------------------------------------------------------------
data("data_brands")
X <- data_brands
dim(X) 
sum(X) # total count is 11713
P    <- X/sum(X) # relative frequencies that add up to 1
rvec <- rowSums(P) # vector of row totals
hist(rvec) # Right tail: Chevrolet, Ford, Honda, Toyota
# These brands are well known and sold a lot in the US.
cvec <- colSums(P) # vector of column totals
R    <- X / rowSums(X) # row profiles
S    <- diag(sqrt(rvec)) %*% (R - rep(1, nrow(X)) %*% 
                                t(cvec)) %*% diag(1/sqrt(cvec))
dimnames(S) <- dimnames(X)
d <- ncol(S)

## ----fig.height=8,fig.width=3-------------------------------------------------
DDC.out <- DDC(S)
ggp     <- cellMap(R = DDC.out$stdResid,
                   mTitle = "brands data",
                   rowtitle = "brands",
                   columntitle = "perceptions")
# pdf("brands_cellmap.pdf", width = 3.5, height = 8)
plot(ggp)
# dev.off()

# Volvo is most deviating (3 cells), followed by Hyundai
# (2 cells) and Maserati (2 cells).

## ----fig.height=6,fig.width=6-------------------------------------------------
svd.out <- svd(S)
svd.out$d # S is singular:
diff(c(0,cumsum(svd.out$d^2)/sum(svd.out$d^2)))
# Can be plotted in a scree plot.

# To match the plot in Riani at al:
svd.out$v[, 2] = -svd.out$v[, 2]
svd.out$u[, 2] = -svd.out$u[, 2]
rowproj <- diag(1/sqrt(rvec)) %*% svd.out$u %*% diag(svd.out$d)
colproj <- diag(1/sqrt(cvec)) %*% svd.out$v %*% diag(svd.out$d)


# pdf(file="brands_ClassCorrespA.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlim=c(-1.1,1.1),
     ylim=c(-0.8,1.5), xlab="", ylab="")
title(main="Classical correspondence analysis of brands data", 
      line=1)
text(rowproj[,1:2], labels = rownames(S))
abline(v=0); abline(h=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.8,0.9,0.8,0.65,0.92,0.8,0.65)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], 
              col = "red", arr.type="simple", arr.width=0.1,
              arr.length = 0.1)
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
# dev.off()

## ----fig.height=6,fig.width=6-------------------------------------------------
MacroPCApar0 <- list(scale = FALSE, center = rep(0, d))
MacroPCA.out <- MacroPCA(S, k = 0, MacroPCApars = MacroPCApar0)

MacroPCA.out <- MacroPCA(S, k = 3, MacroPCApars = MacroPCApar0)


# Computations for the biplot.
V <- MacroPCA.out$loadings
scores <- MacroPCA.out$scores
# To make the plot match the orientation in Riani et al:
V[,2]      <- -V[,2]
scores[,2] <- -scores[,2]
sVals      <- sqrt(nrow(S)*MacroPCA.out$eigenvalues)
U          <- scores %*% diag(1/sVals)
rowproj    <- diag(1/sqrt(rvec)) %*% U %*% diag(sVals)
colproj    <- diag(1/sqrt(cvec)) %*% V %*% diag(sVals)

# pdf(file="brands_MacroCA_biplot.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlim=c(-1.1,1.1),
     ylim=c(-0.6,0.6), xlab="", ylab="")
title(main="Cellwise robust correspondence analysis of brands data",
      line=1)
text(rowproj[,1:2], labels = rownames(S))
abline(h=0); abline(v=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.75,0.76,0.9,0.88,0.65,0.74,0.52)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2], 
              col = "red", arr.type="simple", arr.width=0.1,
              arr.length = 0.1)
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
# dev.off()
# Roughly matches Figure 7 of Riani et al (2022).

Try the cellWise package in your browser

Any scripts or data that you put into this service are public.

cellWise documentation built on Oct. 25, 2023, 5:07 p.m.