Nothing
#' Specific Multiple Correspondence Analysis
#' @title soc.mca
#'
#' \code{soc.mca} performs a specific multiple correspondence analysis on a data.frame of factors, where cases are rows and columns are variables.
#' @param active Defines the active modalities in a data.frame with rows of individuals and columns of factors, without NA's'. Active can also be a named list of data.frames. The data.frames will correspond to the analytical headings.
#' @param sup Defines the supplementary modalities in a data.frame with rows of individuals and columns of factors, without NA's
#' @param identifier A single vector containing a single value for each row/individual in x and sup. Typically a name or an id.number.
#' @param passive A single character vector with the full or partial names of the passive modalities. All names that have a full or partial match will be set as passive.
#' @param Moschidis If TRUE adjusts contribution values for rare modalities. see \link{moschidis}.
#' @param detailed.results If FALSE the result object is trimmed to reduce its memory footprint.
#' @param weight a numeric vector with the weights for the individual rows. The weight is normalized afterwardsds.
#'
#' @return \item{nd}{Number of active dimensions}
#' \item{n.ind}{The number of active individuals}
#' \item{n.mod}{The number of active modalities}
#' \item{eigen}{Eigenvectors}
#' \item{total.inertia}{The sum of inertia}
#' \item{adj.inertia}{A matrix with all active dimensions, adjusted and unadjusted inertias. See \link{variance}}
#' \item{freq.mod}{Frequencies for the active modalities. See \link{add.to.label}}
#' \item{freq.sup}{Frequencies for the supplementary modalities. See \link{add.to.label}}
#' \item{ctr.mod}{A matrix with the contribution values of the active modalities per dimension. See \link{contribution}}
#' \item{ctr.ind}{A matrix with the contribution values of the individuals per dimension.}
#' \item{cor.mod}{The correlation or quality of each modality per dimension.}
#' \item{cor.ind}{The correlation or quality of each individual per dimension.}
#' \item{mass.mod}{The mass of each modality}
#' \item{coord.mod}{A matrix with the principal coordinates of each active modality per dimension.}
#' \item{coord.ind}{A matrix with the principal coordinates of each individual per dimension.}
#' \item{coord.sup}{A matrix with the principal coordinates of each supplementary modality per dimension.}
#' \item{names.mod}{The names of the active modalities}
#' \item{names.ind}{The names of the individuals}
#' \item{names.sup}{The names of the supplementary modalities}
#' \item{names.passive}{The names of the passive modalities}
#' \item{modal}{A matrix with the number of modalities per variable and their location}
#' \item{variable}{A character vector with the name of the variable of the active modalities}
#' \item{Rosenlund.tresh}{A numeric vector with the contribution values adjusted with the Rosenlund threshold, see: see p 92 in: Rosenlund, Lennart. Exploring the City with Bourdieu: Applying Pierre Bourdieu’s Theories and Methods to Study the Community. Saarbrücken: VDM Verlag Dr. Müller, 2009.}
#' \item{t.test.sup}{A matrix with a the student t-test of the coordinates of the supplementary variables}
#' \item{Share.of.var}{A matrix the share of variance for each variable}
#'
#' @name soc.mca
#' @references Le Roux, B., og H. Rouanet. 2010. Multiple correspondence analysis. Thousand Oaks: Sage.
#' @author Anton Grau Larsen
#' @author Jacob Lunding
#' @author Stefan Bastholm Andrade
#' @author Christoph Ellersgaard
#' @seealso \link{soc.csa}, \link{contribution}
#' @examples
#' # Loads the "taste" dataset included in this package
#' data(taste)
#' # Create a data frame of factors containing all the active variables
#' taste <- taste[which(taste$Isup == 'Active'), ]
#'
#' attach(taste)
#' active <- data.frame(TV, Film, Art, Eat)
#' sup <- data.frame(Gender, Age, Income)
#' detach(taste)
#'
#' # Runs the analysis
#' result <- soc.mca(active, sup)
#'
#' # Prints the results
#' result
#'
#' # A specific multiple correspondence analysis
#' # options defines what words or phrases that are looked for in the labels of the active modalities.
#' options(passive = c("Film: CostumeDrama", "TV: Tv-Sport"))
#' soc.mca(active, sup)
#' options(passive = NULL)
#' @export
soc.mca <- function(active, sup = NULL, identifier = NULL, passive = getOption("passive", default = "Missing"), weight = NULL, Moschidis = FALSE,
detailed.results = FALSE) {
# Preparing data
data.type <- what.is.x(active)
############################################################################
# If active is not a list and input data is not an indicator.matrix (which is default)
############################################################################
if (data.type == "data.frame") {
active <- data.frame(lapply(active, factor), check.names = F)
a.r <- nrow(active)
ind.act <- indicator(active)
headings = NULL # As default, no headings are defined
}
############################################################################
# If active is not a list and input data is an indicator.matrix, active is used as it is
############################################################################
if (data.type == "indicator") {
a.r <- nrow(active)
ind.act <- data.frame(active, check.names = F)
headings = NULL # As default, no headings are defined
}
############################################################################
# If active is a list and input data is an identicator.matrix, headings are created and active used as it is
############################################################################
if (data.type == "list.indicators") {
headings <- rep(names(active), sapply(active, ncol))
names(active) <- NULL
ind.act <- do.call("cbind", active)
}
############################################################################
# If active is a list but input data is not an indicator.matrix, headings are created and active is binarized
############################################################################
if (data.type == "list.data.frame") {
headings <- rep(names(active), sapply(active, length))
names(active) <- NULL
active <- do.call("cbind", active)
active <- data.frame(lapply(active, factor), check.names = F)
nl <- sapply(active, nlevels)
headings <- rep(headings, nl)
a.r <- nrow(active)
ind.act <- indicator(active)
}
#########################################
# Creating lists of variable names...
#########################################
varlist <- unique(gsub(": .*", "", colnames(ind.act))) # Unique varnames
varlist.long <- gsub(": .*", "", colnames(ind.act)) # Vector of varnames matching the list of modalities
Q <- median(rowSums(ind.act)) # Number of Questions [we use the median in order to allow for questions that do not sum to one]
passive.set <- grepl(paste(passive, collapse = "|"), colnames(ind.act)) # Defining the set of passive modalities, default is no passive
set <- 1:ncol(ind.act) # set is all
active.set <- set[!passive.set] # active.set is active modalities
ind.reduced <- ind.act[ ,active.set] # Reducing the indicator matrix to the active set of modalities
varlist.long.red <- gsub(": .*", "", colnames(ind.reduced)) # The reduced vector of var names, matching the reduced modality name vector
tmpx <- vector()
#for each variable,
for (i in 1:length(unique(varlist))) {
t1 <- ind.act[,varlist.long == unique(varlist)[i]]
t2 <- ind.reduced[,varlist.long.red == unique(varlist)[i]]
if (is.null(dim(t1)[2])) {
tmpx[i] <- max(ind.act[,varlist.long == unique(varlist)[i]] - ind.reduced[,varlist.long.red == unique(varlist)[i]])
}else{
tmpx[i] <- max(rowSums(ind.act[,varlist.long == unique(varlist)[i]]) - rowSums(ind.reduced[,varlist.long.red == unique(varlist)[i]]))
}
}
Qm <- Q - sum(tmpx) # Calculating the number of questions with passive modalities
###############################################
# Handling supplementary variables
###############################################
# If sup is empty no supplementary data is added
if (identical(sup, NULL) == TRUE) {
sup <- matrix(0, nrow = nrow(ind.act), ncol = 2)
sup[, 1:2] <- cbind(rep(0, nrow(ind.act)), rep(0, nrow(ind.act)))
colnames(sup) <- c("No supplementary points defined 1",
"No supplementary points defined 2")
ind.sup <- sup
}
# If sup is not empty a dataframe of supplementary data is created
if ((nrow(sup) == 0) == FALSE) {
if (all(sapply(sup, is.numeric))) { # if indidactor sup is used as is
ind.sup <- sup
}else{ # else an indicator is created
ind.sup <- indicator(sup)
} }
################################################################
# The actual analysis, a CA of the indicator matrix #
################################################################
result <- subset.ca.indicator(ind.act, ind.sup, active.set, passive.set, Q = Q, Qm = Qm, Moschidis = Moschidis, detailed.results = detailed.results, weight = weight)
result$variable.all <- varlist.long
# Creating 'numeric' identifier if no identifier is given (Default)
if (identical(identifier, NULL) == TRUE) {
identifier <- 1:nrow(ind.act)
}
result$names.mod.all <- colnames(ind.act)
result$names.mod <- colnames(ind.act)[active.set]
result$names.ind <- as.character(identifier)
result$names.sup <- colnames(ind.sup)
result$names.passive <- colnames(ind.act)[passive.set]
result$headings.all <- headings
result$headings <- headings[active.set]
result$indicator.matrix.passive <- ind.act[, passive.set]
result$indicator.matrix.active <- ind.act[, active.set]
result$indicator.matrix.all <- ind.act
#######################################################
# Creating modality overview table to output
#######################################################
x <- colnames(ind.act[, active.set])
varlist <- gsub(": .*", "", x)
x.all <- colnames(ind.act)
varlist.all <- gsub(": .*", "", x.all)
mm <- as.matrix(cbind(varlist, 1:length(varlist)))
mmx <- as.matrix(cbind(varlist.all, 1:length(varlist.all)))
md <- matrix(, nrow = length(unique(unlist(varlist))), ncol = 4)
rownames(md) <- unique(unlist(varlist))
colnames(md) <- c("Total nb. modalities", "Nb. active modalities", "Start", "End")
md <- as.data.frame(md)
for (i in 1:length(unique(unlist(varlist)))) {
mr <- as.numeric(mm[, 2][mm[, 1] == unique(unlist(varlist))[i]])
mx <- as.numeric(mmx[, 2][mmx[, 1] == unique(unlist(varlist.all))[i]])
md[i, 1] <- length(mx)
md[i, 2] <- length(mr)
md[i, 3] <- min(mr)
md[i, 4] <- max(mr)
}
md[, 1] <- as.numeric(md[, 1])
md[, 2] <- as.numeric(md[, 2])
result$modal <- md
###########################################################################
# Calculating 'The Rosenlund treshold' for each modality
###########################################################################
result$Rosenlund.tresh <- rep(1/result$Q/result$modal$`Nb. active modalities`, times = result$modal$`Nb. active modalities`)
result$Rosenlund.tresh.all <- rep(1/result$Q/result$modal$`Total nb. modalities`, times = result$modal$`Total nb. modalities`)
#############################################
# The variable vectors
#############################################
variable <- vector()
for (i in 1:nrow(md)) {
variable <- c(variable, rep(rownames(md)[i], md[i, "Nb. active modalities"]))
}
result$variable <- variable
if (identical(sup, NULL) == FALSE) {
varnames <- colnames(sup)
ml <- vector()
for (i in 1:ncol(sup)) {
ml <- c(ml, rep(varnames[i], nlevels(sup[[i]])))
}
result$variable.sup <- ml
}
result$subset.var <- Qm
Nmodal <- result$n.mod
Nsup <- sum(result$freq.sup != 0)
Nid <- result$n.ind
#######################################################
# Mass matrix
#######################################################
tmp <- data.frame(result$variable.all, result$mass.mod.all)
mass.all <- matrix( , nrow = length(unique(varlist)), ncol = 4)
for (i in 1:length(unique(varlist))) {
mass.all[i,4] <- as.numeric(sum(tmp[which(result$variable.all == unique(result$variable.all)[i]),2]))
}
tmp2 <- data.frame(result$variable, result$mass.mod)
mass.act <- vector()
for (i in 1:length(unique(varlist))) {
mass.act[i] <- as.numeric(sum(tmp2[which(result$variable == unique(result$variable)[i]),2]))
}
mass.all[,1] <- mass.act
mass.all[,2] <- mass.all[,4] - mass.all[,1]
mass.all[,3] <- mass.all[,2] / mass.all[,4] * 100
rownames(mass.all) <- unique(result$variable.all)
colnames(mass.all) <- c("Active mass", "Passive mass", "% of mass passive" ,"Total mass")
result$mass <- round(mass.all,10)
tmp <- data.frame(result$variable, rowSums(result$ctr.mod.raw))
Share.of.var <- vector()
for (i in 1:length(unique(varlist))) {
Share.of.var[i] <- sum(tmp[which(result$variable == unique(result$variable)[i]),2]) / (result$total.inertia.raw)
}
names(Share.of.var) <- unique(result$variable)
Share.of.var <- data.frame("Active modalities" = result$modal[, "Nb. active modalities"], "Share of variance" = round(Share.of.var,3), check.names = F)
rownames(Share.of.var) <- rownames(result$modal)
result$Share.of.var <- Share.of.var
median.standard <- function(result) {
coord.ind <- result$coord.ind
coord.median <- apply(coord.ind, 2, median)
dim.ind <- seq(ncol(coord.ind))[coord.median > 0]
result <- invert(result, dim.ind)
return(result)
}
result <- median.standard(result)
# Cleaning
if(identical(detailed.results, FALSE)){
result$svd.u <- NULL
result$cor.mod.all.raw <- NULL
result$ctr.mod.all.raw <- NULL
#result$ctr.mod.raw <- NULL
result$eigen.raw <- NULL
result$indicator.matrix.all <- NULL
result$indicator.matrix.trans <- NULL
}
class(result) <- "soc.mca"
return(result)
}
# ' Correspondence analysis on a indicator matrix
# '
# ' This function is part of the soc.mca function but allows for manipulation of the indicator matrix before analysis.
# ' Most users will not need this function.
# '
# ' @param ind.act An indicator matrix of all the active modalities (including those that are to be set as passive)
# ' @param ind.sup An indicator matrix of the supplementary modalities
# ' @param subset A vector containing column indices of passive modalities
# ' @param Q The number of variables
# ' @param Qm The number of variables without passive modalities
# ' #@export
# ' @return a list of various results. See \link{soc.mca} documentation
subset.ca.indicator <- function (ind.act, ind.sup, active.set, passive.set, Q, Qm,
Moschidis, detailed.results = FALSE, weight = NULL) {
if(is.null(weight)) weight <- rep(1, nrow(ind.act))
Z.act <- ind.act
Z.sup <- ind.sup
row.w <- weight/sum(weight) * nrow(Z.act) # a vector of row weights
#colZ <- colSums(Z.act)
if (identical(Moschidis, TRUE)) {
Y <- vector()
cnZ.act <- colnames(Z.act)
varlist <- gsub(pattern = ": .*", "", cnZ.act)
for (i in seq(Q)) {
x <- rep(table(varlist)[i], table(varlist)[i])
Y <- c(Y, x)
}
Z.act <- Z.act %*% diag(1/(Y - 1))
colnames(Z.act) <- cnZ.act
}
I <- dim(Z.act)[1]
J <- dim(Z.act)[2]
# P <- Z.act/sum(Z.act)
Z.act_w <- row.w * Z.act
n_k <- colSums(Z.act_w)
n_kp <- n_k[active.set] # subsetting modality frequencies to remove passive modalities
NK <- diag(n_k)
NKp <- NK[active.set, active.set] # subsetting modality frequencies to remove passive modalities
n <- nrow(Z.act)
Z0 <- sweep(Z.act, 2, n_k/n) # subtracts the relative frequency of the modality (nk/n) from the k-column dirac meassures (1/0)
Z0 <- as.matrix(Z0)
Z0t <- Z0[, active.set] # subsetting to remove passive modalities = Z̃0
H0t <- sqrt(row.w) * (1/sqrt(Q)) * Z0t %*% diag(1/sqrt(n_kp)) ## I don't understand what the squared row.w term is doing.
dec <- svd(H0t)
# cm.all <- colSums(P)
cm.all <- n_k/n
cm.all[passive.set] <- 0
mass.mod.all <- cm.all
rm <- row.w/n
#diag.cm.all <- diag(1/sqrt(cm.all))
#eP <- rm %*% t(cm.all)
#S <- (P - eP)/sqrt(eP)
K <- length(active.set)
#S <- S[, active.set]
#cm <- cm.all[active.set]
fk <- n_kp/n
cm <- n_kp/n / Q
#cm.all[passive.set] <- 0
#diag.cm <- diag.cm.all[active.set, active.set]
#diag.cm.all[passive.set, passive.set] <- 0
#dec <- svd(S)
eigen <- dec$d^2
pc.mod <- sqrt(n * Q) * diag(1/sqrt(n_kp)) %*% dec$v %*% diag(dec$d)
#pc.mod <- diag.cm %*% dec$v %*% diag(dec$d)
#diag.rm <- Matrix::Diagonal(x = 1/sqrt(rm))
pc.ind <- (1/sqrt(row.w)) * sqrt(n) * dec$u %*% diag(dec$d)
#pc.ind <- diag.rm %*% dec$u %*% diag(dec$d)
inr.ind <- row.w/n * pc.ind^2
#inr.ind <- Matrix::Diagonal(x = row.w/n) %*% pc.ind^2
inr.mod <- (fk/Q) * pc.mod^2
#inr.mod <- diag(cm) %*% pc.mod^2
ctr.ind <- Matrix::t(Matrix::t(inr.ind)/dec$d^2)
ctr.mod <- t(t(inr.mod)/dec$d^2)
ctr.mod.raw <- inr.mod
cor.ind <- inr.ind/Matrix::rowSums(inr.ind)
cor.mod <- inr.mod/rowSums(inr.mod)
cor.mod.raw <- inr.mod
if (sum(ind.sup) > 0) {
Z.sup <- Z.sup[, which(colSums(Z.sup) > 0)]
Z.star <- Z.sup
I.star <- dim(Z.sup)[1]
cs.star <- apply(Z.sup, 2, sum)
base <- Z.star/matrix(rep(cs.star, I.star), nrow = I.star,
byrow = TRUE)
f.s1 <- dec$u * sqrt(eigen)/sqrt(rm)
a.s1 <- f.s1/sqrt(eigen)
pc.sup <- t(base) %*% a.s1
t <- matrix(NA, nrow = nrow(pc.sup), ncol = ncol(pc.sup))
for (j in 1:ncol(pc.sup)) {
for (i in 1:nrow(pc.sup)) {
t[i, j] <- round(sqrt(cs.star[i] * ((I - 1)/(I -
cs.star[i]))) * pc.sup[i, j], 5)
}
}
}
else {
pc.sup <- NULL
t.plane <- NULL
t <- NULL
}
pc.all <- pc.mod
ctr.mod.all <- ctr.mod
cor.mod.all <- cor.mod
ctr.mod.all.raw <- ctr.mod.raw
cor.mod.all.raw <- cor.mod.raw
if (length(active.set) < ncol(Z.act)) {
cs.star.all <- n_k
tmp.ctr <- ctr.mod
tmp.cor <- cor.mod
tmp.ctr.raw <- ctr.mod.raw
tmp.cor.raw <- cor.mod.raw
rownames(tmp.ctr) <- colnames(Z.act)[active.set]
rownames(tmp.cor) <- colnames(Z.act)[active.set]
rownames(tmp.ctr.raw) <- colnames(Z.act)[active.set]
rownames(tmp.cor.raw) <- colnames(Z.act)[active.set]
passive.mod <- setdiff(names(cs.star.all), rownames(tmp.ctr))
passive.modalities <- matrix(0, nrow = length(passive.mod),
ncol = length(eigen))
rownames(passive.modalities) <- passive.mod
tmp.ctr <- rbind(tmp.ctr, passive.modalities)
tmp.cor <- rbind(tmp.cor, passive.modalities)
tmp.ctr.raw <- rbind(tmp.ctr.raw, passive.modalities)
tmp.cor.raw <- rbind(tmp.cor.raw, passive.modalities)
if (identical(detailed.results, TRUE)) {
pc.all <- matrix(0, nrow = dim(Z.act)[2], ncol = length(eigen))
ctr.mod.all <- matrix(0, nrow = dim(Z.act)[2], ncol = length(eigen))
cor.mod.all <- matrix(0, nrow = dim(Z.act)[2], ncol = length(eigen))
ctr.mod.all.raw <- matrix(0, nrow = dim(Z.act)[2],
ncol = length(eigen))
cor.mod.all.raw <- matrix(0, nrow = dim(Z.act)[2],
ncol = length(eigen))
for (j in 1:length(eigen)) {
for (i in 1:length(colnames(Z.act))) {
pc.all[i, j] <- sum(pc.ind[Z.act[, i] > 0,
j])/cs.star.all[i]/sqrt(eigen[j])
ctr.mod.all[i, j] <- tmp.ctr[rownames(tmp.ctr) ==
colnames(Z.act)[i], j]
cor.mod.all[i, j] <- tmp.cor[rownames(tmp.cor) ==
colnames(Z.act)[i], j]
ctr.mod.all.raw[i, j] <- tmp.ctr.raw[rownames(tmp.ctr.raw) ==
colnames(Z.act)[i], j]
cor.mod.all.raw[i, j] <- tmp.cor.raw[rownames(tmp.cor.raw) ==
colnames(Z.act)[i], j]
}
}
}
}
R1.dim <- K - Qm
R1.eigen.val <- eigen[1:R1.dim]
Mean.r.eigen <- ((K/Q) - sum(cm))/R1.dim
R2.eigen.val <- eigen[eigen >= Mean.r.eigen]
R2.dim <- length(R2.eigen.val)
unadj.var <- 100 * (eigen[1:R1.dim])/sum(eigen[1:R1.dim])
adj.var.mod <- (Q/(Q - 1))^2 * (eigen[1:R2.dim] - Mean.r.eigen)^2
sum.adj.var <- sum(adj.var.mod)
adj.var <- round(((adj.var.mod/sum.adj.var) * 100), digits = 1)
cumpercent <- cumsum(adj.var)
adj.inertia <- cbind(1:R2.dim, round(eigen[1:R2.dim], 3),
round(unadj.var[1:R2.dim], 1), adj.var, cumpercent)
colnames(adj.inertia) <- c("Dim", "Eigen", "Var", "Adj.Var",
"Cum%")
adj.inertia2 <- cbind(1:R1.dim, round(eigen[1:R1.dim], 3),
round(unadj.var, 1), round(cumsum(unadj.var), 1))
colnames(adj.inertia2) <- c("Dim", "Eigen", "Unadj.Var%",
"Cum%")
freq.mod.all <- n_k
freq.mod <- n_kp
if (sum(ind.sup) > 0) {
freq.sup <- colSums(Z.sup)
}
else {
freq.sup <- NULL
}
ca.output <- list(nd = R2.dim, n.ind = nrow(Z.act), n.mod = length(active.set),
eigen = eigen[1:R2.dim], eigen.raw = eigen, Qm = Qm,
Q = Q, total.inertia = sum(eigen[1:R2.dim]), total.inertia.raw = sum(eigen),
adj.inertia = adj.inertia, inertia_full = adj.inertia2,
freq.mod = freq.mod, freq.mod.all = freq.mod.all, freq.sup = freq.sup,
ctr.ind = as.matrix(ctr.ind), ctr.mod.raw = ctr.mod.raw,
cor.mod = cor.mod, cor.ind = as.matrix(cor.ind), mass.mod = cm,
mass.mod.all = mass.mod.all, coord.mod = pc.mod, coord.ind = as.matrix(pc.ind),
coord.sup = pc.sup, t.test.sup = t, ctr.mod = ctr.mod,
indicator.matrix.trans = Z.act_w)
if (identical(detailed.results, TRUE)) {
ca.output$ctr.mod.all = ctr.mod.all
ca.output$ctr.mod.all.raw = ctr.mod.all.raw
ca.output$cor.mod.raw = cor.mod.raw
ca.output$cor.mod.all = cor.mod.all
ca.output$cor.mod.all.raw = cor.mod.all.raw
ca.output$coord.all = pc.all
ca.output$full <- svd(H0t, nu = nrow(H0t), nv = ncol(H0t))
}
names(ca.output$mass.mod) <- NULL
dimnames(ca.output$coord.sup) <- NULL
names(ca.output$freq.mod) <- NULL
names(ca.output$freq.sup) <- NULL
return(ca.output)
}
#' Indicator matrix
#'
#' Creates an indicator matrix from a data.frame with the categories of the questions as columns and individuals as rows.
#'
#' @param x a data.frame of factors
#' @param id a vector defining the labels for the individuals. If id = NULL row number is used.
#' @param ps the seperator used in the creation of the names of the columns.
#'
#' @return Returns a indicator matrix
#' @seealso \link{soc.mca}
#' @examples
#' a <- rep(c("A","B"), 5)
#' b <- rep(c("C", "D"), 5)
#' indicator(data.frame(a,b))
#' @export indicator
indicator <- function(x, id = NULL, ps = ": "){
obj <- x
I <- nrow(obj) # Number of individuals
levels.n <- unlist(lapply(obj, nlevels))
n <- cumsum(levels.n) # Number of modalities for each question
m <- max(n) # Total number of modalities
Q <- ncol(obj) # Number of questions
Z <- matrix(0, nrow = I, ncol = m) # Predefinition of the indicatormatrix
newdat <- lapply(obj, as.numeric)
offset <- (c(0, n[-length(n)]))
for (i in seq(Q)) Z[seq(I) + (I * (offset[i] + newdat[[i]] - 1))] <- 1 # Indicator matrix
fn <- rep(names(obj), unlist(lapply(obj, nlevels)))
ln <- unlist(lapply(obj, levels))
col.names <- paste(fn, ln, sep = ps)
colnames(Z) <- col.names
if (identical(id, NULL) == TRUE){
rownames(Z) <- as.character(seq(I))
}else{
rownames(Z) <- id
}
return(Z)
}
#' Class Specific Multiple Correspondence Analysis
#'
#' \code{soc.csa} performs a class specific multiple correspondence analysis on a data.frame of factors, where cases are rows and columns are variables. Most descriptive and analytical functions that work for \link{soc.mca}, also work for \code{soc.csa}
#' @param object is a soc.ca class object created with \link{soc.mca}
#' @param class.indicator the row indices of the class specific individuals
#' @param sup Defines the supplementary modalities in a data.frame with rows of individuals and columns of factors, without NA's
#' @return \item{nd}{Number of active dimensions}
#' \item{n.ind}{The number of active individuals}
#' \item{n.mod}{The number of active modalities}
#' \item{eigen}{Eigenvectors}
#' \item{total.inertia}{The sum of inertia}
#' \item{adj.inertia}{A matrix with all active dimensions, adjusted and unadjusted inertias. See \link{variance}}
#' \item{freq.mod}{Frequencies for the active modalities. See \link{add.to.label}}
#' \item{freq.sup}{Frequencies for the supplementary modalities. See \link{add.to.label}}
#' \item{ctr.mod}{A matrix with the contribution values of the active modalities per dimension. See \link{contribution}}
#' \item{ctr.ind}{A matrix with the contribution values of the individuals per dimension.}
#' \item{cor.mod}{The correlation or quality of each modality per dimension.}
#' \item{cor.ind}{The correlation or quality of each individual per dimension.}
#' \item{mass.mod}{The mass of each modality}
#' \item{coord.mod}{A matrix with the principal coordinates of each active modality per dimension.}
#' \item{coord.ind}{A matrix with the principal coordinates of each individual per dimension.}
#' \item{coord.sup}{A matrix with the principal coordinates of each supplementary modality per dimension. Notice that the position of the supplementary modalities in class specific analysis is the mean point of the individuals, which is not directly comparable with the cloud of the active modalities.}
#' \item{indicator.matrix}{A indicator matrix. See \link{indicator}}
#' \item{names.mod}{The names of the active modalities}
#' \item{names.ind}{The names of the individuals}
#' \item{names.sup}{The names of the supplementary modalities}
#' \item{names.passive}{The names of the passive modalities}
#' \item{modal}{A matrix with the number of modalities per variable and their location}
#' \item{variable}{A vector with the name of the variable for each of the active modalities}
#' \item{variable.sup}{A vector with the name of the variable for each of the supplementary modalities}
#' \item{original.class.indicator}{The class indicator}
#' \item{original.result}{The original soc.ca object used for the CSA}
#' @name soc.csa
#' @export soc.csa
#' @author Anton Grau Larsen, University of Copenhagen
#' @author Stefan Bastholm Andrade, University of Copenhagen
#' @author Christoph Ellersgaard, University of Copenhagen
#' @seealso \link{add.to.label}, \link{contribution}
#' @references Le Roux, B., og H. Rouanet. 2010. Multiple correspondence analysis. Thousand Oaks: Sage.
#' @examples
#' example(soc.ca)
#' class.age <- which(taste$Age == '55-64')
#' res.csa <- soc.csa(result, class.age)
#' res.csa
soc.csa <- function(object, class.indicator, sup = NULL){
Z.act <- object$indicator.matrix.active # Original indicator matrix
Q <- nlevels(as.factor(object$variable)) # Number of questions
I <- nrow(Z.act) # Original number of individuals
Z.hat <- Z.act[class.indicator, ] # Indicator matrix for the CSA
i <- length(class.indicator) # Number of individuals in CSA
cm <- apply(Z.hat, 2, sum)
CM <- apply(Z.act, 2, sum)
P <- Z.hat / sum(Z.hat)
cmpc <- colSums(P) # Column (modality) mass
rmpc <- rep(1/i,i)
H.hat <- matrix(, nrow = i, ncol = length(cm))
for (k in seq(cm)){
H.hat[,k] <- (1/sqrt(Q)) * (sqrt(I/i)) * (Z.hat[, k]-(cm[k]/i)) * (1/sqrt(CM[k]))
}
colnames(H.hat) <- colnames(Z.hat)
modal.names <- colnames(Z.hat)
H.svd <- svd(H.hat)
dec <- H.svd
### Modalitetskoordinater
csa.m.coord <- matrix(nrow = nrow(H.svd$v), ncol = ncol(H.svd$v))
for (ff in 1:length(H.svd$d)){
csa.m.coord[, ff] <- (sqrt(Q*I)) * (1/sqrt(CM)) * H.svd$v[, ff] * H.svd$d[ff]
}
rownames(csa.m.coord) <- modal.names
### Modalitetsbidrag
csa.m.ctr <- matrix(nrow = nrow(H.svd$v), ncol = ncol(H.svd$v))
for (ff in 1:length(H.svd$d)){
csa.m.ctr[, ff] <- (((CM/I)/Q) * (csa.m.coord[, ff])^2)/(H.svd$d[ff]^2)
}
csa.m.ctr.raw <- ((CM/I)/Q) * (csa.m.coord^2)
csa.m.cor <- csa.m.ctr.raw/rowSums(csa.m.ctr.raw)
### Individkoordinater
csa.i.coord <- matrix(nrow = nrow(H.svd$u), ncol = ncol(H.svd$u))
for (ff in 1:length(H.svd$d)){
csa.i.coord[, ff] <- sqrt(i) * H.svd$u[, ff] * H.svd$d[ff]
}
### Individbidrag
csa.i.ctr <- matrix(nrow = nrow(H.svd$u), ncol = ncol(H.svd$u))
for (ff in 1:length(H.svd$d)){
csa.i.ctr[, ff] <- ((1/i) * (csa.i.coord[, ff])^2)/(H.svd$d[ff]^2)
}
csa.i.ctr.raw <- (1/i) * (csa.i.coord^2)
csa.i.cor <- csa.i.ctr.raw/rowSums(csa.i.ctr.raw)
# Eigenvectors
eigen <- H.svd$d^2
###############################################
#### Dimension reduction and explained variance
K <- ncol(Z.act)
Qm <- object$subset.var
# First reduction of dimensionality
R1.dim <- K-Qm
R1.eigen.val <- eigen[1:R1.dim]
# Second reduction of dimensionality
Mean.r.eigen <- mean(R1.eigen.val, na.rm = TRUE)
R2.eigen.val <- eigen[eigen >= Mean.r.eigen]
R2.dim <- which(R2.eigen.val >= Mean.r.eigen)
# Explained variance
unadj.var <- 100*(eigen[1:R1.dim])/sum(eigen[1:R1.dim]) # Unadjusted rates of variance
sum.adj.var.mod <- (eigen[R2.dim]-Mean.r.eigen)^2
sum.adj.var <- sum(sum.adj.var.mod)
adj.var <- round(((sum.adj.var.mod/sum.adj.var)*100), digits = 1) # The adjusted rates of variance
cumpercent <- cumsum(adj.var)
adj.inertia <- cbind(R2.dim, round(eigen[R2.dim], 3), round(unadj.var[R2.dim], 1), adj.var ,cumpercent)
colnames(adj.inertia) <- c("Dim", "Eigen", "Var" ,"Adj.Var", "Cum%")
###################
freq.mod <- cm
###############
############
# Preparing for object object
names.mod <- modal.names
names.ind <- object$names.ind[class.indicator]
names.passive <- object$names.passive
############
# Supplementary variables
# They are projected into the cloud of individuals - NOT the cloud of modalities
sup.coord <- NULL
freq.sup <- 0
names.sup <- NULL
if(identical(sup,NULL) == FALSE){
sup.coord <- matrix(NA, nrow= ncol(sup), ncol = ncol(csa.i.coord))
rownames(sup.coord) <- colnames(sup)
freq.sup <- colSums(sup)
names.sup <- colnames(sup)
for (j in 1:ncol(csa.i.coord)){
d1 <- csa.i.coord[,j] #/ H.svd$d[j]
sup.c <- sup
sup.c[] <- 1
sup.c[sup == 0] <- 0
sup.c <- sup.c * d1
sup.coord[, j] <- (1 / freq.sup) * apply(sup.c, 2, sum, na.rm = TRUE)
}
sup.coord[which(freq.sup == 0),] <- 0
}
#############
# Result object
csca.result <- list(
nd = object$nd,
n.ind = i,
n.ind.org = I,
n.mod = ncol(Z.hat),
eigen = eigen,
total.inertia = sum(eigen[R2.dim]),
adj.inertia = adj.inertia,
freq.mod = freq.mod,
freq.mod.org = object$freq.mod,
freq.sup = freq.sup,
ctr.mod = csa.m.ctr,
ctr.ind = csa.i.ctr,
cor.mod = csa.m.cor, #cor.mod[, R2.dim], cor.ind = cor.ind[, R2.dim], # Not implemented
cor.ind = csa.i.cor,
mass.mod = cmpc, # Massen er etchy - se nedenfor
mass.ind = rmpc, # Massen er etchy - se nedenfor
coord.mod = csa.m.coord,
#coord.mod = pc.mod,
coord.ind = csa.i.coord,
coord.sup = sup.coord,
names.mod = names.mod,
names.ind = names.ind,
names.sup = names.sup,
names.passive = names.passive,
indicator.matrix = Z.hat,
modal = object$modal,
variable = object$variable,
variable.sup = "Not Implemented",
variable.all = object$variable.all,
headings.all = object$headings.all,
headings = object$headings,
subset.var = object$subset.var,
original.result = object,
original.class.indicator = class.indicator,
svd.d = H.svd$d,
svd.u = H.svd$u,
svd.v = H.svd$v
)
median.standard <- function(object){
coord.ind <- object$coord.ind
coord.median <- apply(coord.ind, 2, median)
dim.ind <- seq(ncol(coord.ind))[coord.median > 0]
object <- invert(object, dim.ind)
return(object)
}
csca.result <- median.standard(csca.result)
#####################################
# Class and return
class(csca.result) <- c("soc.mca", "soc.csa")
return(csca.result)
}
#' Create categories according to the quadrant position of each individual
#'
#' Creates a vector from two dimensions from a soc.ca object. Labels are the
#' cardinal directions with the first designated dimension running East - West.
#' The center category is a circle defined by \code{cut.radius}.
#'
#' @param object a soc.ca class object
#' @param dim the dimensions
#' @param cut.min Minimum cut value
#' @param cut.max Maximum cut value
#' @param cut.radius Radius of the center category
#'
#' @return Returns a character vector with category memberships
#' @seealso \link{soc.mca}
#' @export create.quadrant
#'
#' @examples
#' example(soc.ca)
#' create.quadrant(result, dim = c(2, 1))
#' table(create.quadrant(result, dim = c(1, 3), cut.radius = 0.5))
#'
create.quadrant <- function(object, dim = c(1,2), cut.min = -0.125, cut.max = 0.125, cut.radius = 0.25){
coord <- object$coord.ind
coord.cut <- coord
coord.cut[coord < cut.min] <- "Min"
coord.cut[coord > cut.max] <- "Max"
coord.cut[coord <= cut.max & coord >=cut.min] <- "Medium"
dim1 <- coord.cut[,dim[1]]
dim2 <- coord.cut[,dim[2]]
distance <- sqrt(((coord[, dim[1]]^2) + (coord[, dim[2]]^2)))
position <- dim1
position[dim1 == "Max" & dim2 == "Max"] <- "North-East"
position[dim1 == "Max" & dim2 == "Medium"] <- "East"
position[dim1 == "Max" & dim2 == "Min"] <- "South-East"
position[dim1 == "Medium" & dim2 == "Medium"] <- "Center"
position[dim1 == "Medium" & dim2 == "Min"] <- "South"
position[dim1 == "Medium" & dim2 == "Max"] <- "North"
position[dim1 == "Min" & dim2 == "Max"] <- "North-West"
position[dim1 == "Min" & dim2 == "Medium"] <- "West"
position[dim1 == "Min" & dim2 == "Min"] <- "South-West"
position[distance < cut.radius] <- "Center"
return(position)
}
#' Multiple Class Specific Correspondence Analysis on all values in a factor
#'
#' \code{csa.all} performs a class specific correspondence analysis for each
#' level in a factor variable. Returns a list with soc.csa objects and a list of
#' measures defined by \link{csa.measures}
#' @param object is a soc.ca class object created with \link{soc.mca}
#' @param variable a factor with the same length and order as the active
#' variables that created the soc.ca object
#' @param dim is the dimension analyzed
#' @param ... further arguments are directed to \link{csa.measures}
#' @return \item{results}{a list of \link{soc.csa} result objects}
#' @return \item{cor}{a list of correlation matrixes}
#' @return \item{cosines}{a list of matrixes with cosine values}
#' @return \item{angles}{a list of matrixes with cosine angles between
#' dimensions}
#' @export
#' @examples
#' example(soc.ca)
#' csa.all(result, taste$Age)
#' csa.all(result, taste$Age)$measures
#' @seealso \link{soc.csa}, \link{cor}, \link{csa.measures}
csa.all <- function(object, variable, dim = 1:5, ...){
lev.variable <- levels(variable)
result.list <- list()
for (i in 1:length(lev.variable)){
dummy.class <- which(variable == lev.variable[i])
result.list[[i]] <- soc.csa(object, class.indicator = dummy.class)
}
names(result.list) <- lev.variable
measure.list <- lapply(result.list, csa.measures, format = FALSE, dim.csa = dim, ...)
list(results = result.list, measures = measure.list)
}
#' Add supplementary individuals to a result object
#'
#' @param object is a soc.ca class object created with \link{soc.mca}
#' @param sup.indicator is a indicator matrix for the supplementary individuals with the same columns as the active variables in object.
#' @param replace if TRUE the coordinates of the active individuals are discarded. If FALSE the coordinates of the supplementary and active individuals are combined. The factor \code{object$supplementary.individuals} marks the supplementary individuals.
#' @return a soc.ca class object created with \link{soc.mca}
#' @export
#' @examples
#' example(soc.mca)
#' res.pas <- soc.mca(active, passive = "Costume")
#' res.sup <- supplementary.individuals(res.pas, sup.indicator = indicator(active))
#' a <- res.sup$coord.ind[res.sup$supplementary.individuals == "Supplementary",]
#' b <- res.pas$coord.ind
#' all.equal(as.vector(a), as.vector(b))
#' map.ind(res.sup)
supplementary.individuals <- function(object, sup.indicator, replace = FALSE){
if (length(object$names.passive) > 0) sup.indicator <- sup.indicator[, -which(colnames(sup.indicator) %in% object$names.passive)]
# sup.ind.dim <- function(object, sup.indicator, dim){
# Q <- length(table(object$variable))
# yk <- t(object$coord.mod[, dim] * t(sup.indicator))
# #Q.ind <- yk / rowSums(sup.indicator)
# Q.ind <- yk / Q
# out <- 1/sqrt(object$eigen[dim]) * rowSums(Q.ind)
# out
# }
sup.ind.dim <- function(object, sup.indicator, dim){
Q <- length(table(object$variable))
yk <- t(object$coord.mod[, dim] * t(sup.indicator))
pk <- (colSums(object$indicator.matrix.active)/object$n.ind) / Q
sas <- pk * object$coord.mod[, dim]
Q.ind <- yk / Q
out <- 1/sqrt(object$eigen[dim]) * (rowSums(Q.ind) - sum(sas))
out
}
ndim <- 1:ncol(object$coord.ind)
sup.ind.coord <- sapply(ndim, sup.ind.dim, object = object, sup.indicator = sup.indicator)
if(identical(replace, FALSE)){
object$coord.ind <- rbind(object$coord.ind, sup.ind.coord)
rownames(object$coord.ind) <- NULL
object$supplementary.individuals <- c(rep("Active", object$n.ind), rep("Supplementary", nrow(sup.indicator)))
object$names.ind <- c(object$names.ind, rownames(sup.indicator))
}
if(identical(replace, TRUE)){
object$coord.ind <- sup.ind.coord
rownames(object$coord.ind) <- NULL
object$names.ind <- rownames(sup.indicator)
}
object
}
# # Test of supplementary individuals
# example(soc.mca)
#
# # Ingen passive modaliteter
# res.sup <- supplementary.individuals(result, sup.indicator = indicator(active))
# a <- res.sup$coord.ind[res.sup$supplementary.individuals == "Supplementary",]
# b <- result$coord.ind
# all.equal(as.vector(a), as.vector(b))
# map.ind(res.sup)
#
# # Med passive modaliteter
# res.pas <- soc.mca(active, passive = "Costume")
# res.sup <- supplementary.individuals(res.pas, sup.indicator = indicator(active))
# a <- res.sup$coord.ind[res.sup$supplementary.individuals == "Supplementary",]
# b <- res.pas$coord.ind
# all.equal(as.vector(a), as.vector(b))
# map.ind(res.sup)
#
# # Med passive modaliteter
# res.pas <- soc.mca(active, passive = "Costume")
# res.sup <- supplementary.individuals(res.pas, sup.indicator = indicator(active)[1:10,])
# a <- res.sup$coord.ind[res.sup$supplementary.individuals == "Supplementary",]
# b <- res.pas$coord.ind[1:10,]
# all.equal(as.vector(a), as.vector(b))
# map.ind(res.sup, point.fill = res.sup$supplementary.individuals, label = T)
#
# # Med passive modaliteter
# sup.indicator <- indicator(active)[1:10,]
# rownames(sup.indicator) <- paste("Sup", 1:10)
# res.pas <- soc.mca(active, passive = "Costume")
# res.sup <- supplementary.individuals(res.pas, sup.indicator = sup.indicator)
# a <- res.sup$coord.ind[res.sup$supplementary.individuals == "Supplementary",]
# b <- res.pas$coord.ind[1:10,]
# all.equal(as.vector(a), as.vector(b))
# map.ind(res.sup, point.fill = res.sup$supplementary.individuals, label = T)
#' Check if data is valid for soc.mca
#'
#' Performs tests on what has been passed on to soc.mca by the user.
#'
#' @param x the active variables sent to soc.mca
#'
#' @return a character vector with an evaluation of whether x is data.frame, a list of data.frames, an indicator or a list of indicators.
#'
#' @examples
#' \dontrun{
#' # Valid scenarios ----
#'
#' # X is a valid data.frame
#' x <- taste[, 2:7]
#' what.is.x(x)
#'
#' # X is a valid indicator
#' x <- indicator(taste[, 2:7])
#' what.is.x(x)
#'
#' # X is a valid list of data.frames with names
#' x <- list(nif = taste[, 2:3], hurma = taste[, 4:5])
#' what.is.x(x)
#'
#' # X is a valid list of indicators
#' x <- list(nif = indicator(taste[, 2:3]), hurma = indicator(taste[, 4:5]))
#' what.is.x(x)
#'
#' # Invalid scenarios ----
#'
#' # X is a matrix - but not numeric
#' x <- as.matrix(taste[, 2:7])
#' what.is.x(x)
#'
#' # X is a of data.frames list but does not have names
#' x <- list(taste[, 1:3], taste[, 4:5])
#' what.is.x(x)
#'
#' # X is a list of indicators but does not have names
#' x <- list(indicator(taste[, 2:3]), indicator(taste[, 4:5]))
#' what.is.x(x)
#'
#' # X is a data.frame and contains NA
#' x <- taste[, 2:7]
#' x[1,1] <- NA
#' what.is.x(x)
#'
#' # X is a list of indicators and contains NA
#' x <- list(nif = indicator(taste[, 2:3]), hurma = indicator(taste[, 4:5]))
#' x[[1]][1,1] <- NA
#' what.is.x(x)
#'
#' # X contains elements that are neither a matrix nor a data.frame
#' x <- list(nif = 1:10, taste[, 1:3], taste[, 4:7])
#' what.is.x(x)
#'
#' # X contains both indicators and matrixes
#' x <- list(nif = taste[, 2:3], hurma = indicator(taste[, 5:6]))
#' what.is.x(x)
#' }
what.is.x <- function(x){
# Is it a list of data.frames?
is.d <- is.data.frame(x)
is.l <- is.list(x)
is.m <- is.matrix(x)
is.lm <- all(unlist(lapply(x, is.matrix)))
is.ld <- is.d == FALSE & is.l == TRUE & is.lm == FALSE
# Scenario: List
if(is.ld | is.lm){
not.all.data.frames <- any(!unlist(lapply(x, is.data.frame)))
not.all.matrix <- any(!unlist(lapply(x, is.matrix)))
if(all(not.all.data.frames, not.all.matrix)) stop("x is a list, but not all elements are the same valid type (aka. data.frame or indicator matrix).")
l.nam <- names(x)
if(unique(length(l.nam)) != length(x)) stop("x is a list, but each element (heading) does not have a unique name.")
}
# Scenario: Indicator
if(is.m == TRUE & is.numeric(x) == FALSE) stop("Your data is a matrix, but it is not numeric.")
# Check for missing
if(anyNA(x, recursive = TRUE)) stop("Your data includes NA. Try recoding to missing.")
# Is it a list of only matrixes?
o <- NA
# It is surely a data.frame
if(is.d) o <- "data.frame"
# It is surely an indicator matrix
if(is.m) o <- "indicator"
# It is surely a list of data.frames
if(is.ld) o <- "list.data.frame"
# It is surely a list of indicators
if(is.lm) o <- "list.indicators"
o
}
balance.headings <- function(indicator, headings, Q) {
heads <- unique(headings)
n_head <- length(heads)
mass_by_head <- Q / n_head
Q_prH <- sapply(heads, function(x) max(rowSums(indicator[, headings == x])))
weight <- Q_prH / mass_by_head
for(i in 1:length(heads)) {
indicator[ ,headings == heads[i]] <- indicator[ ,headings == heads[i]] / weight[i]
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.