mixedMem.R
In mixedMem: Tools for Discrete Multivariate Mixed Membership Models

## ----echo = FALSE, warning=FALSE----------------------------------------------
library(xtable, quietly = TRUE)

## -----------------------------------------------------------------------------
library(mixedMem)
data(ANES)
# Dimensions of the data set: 279 individuals with 19 responses each
dim(ANES)
# The 19 variables and their categories
# The specific statements for each variable can be found using help(ANES)
# Variables titled EQ are about Equality
# Variables titled IND are about Econonic Individualism
# Variables titled ENT are about Free Enterprise
colnames(ANES)
# Distribution of responses
table(unlist(ANES))

## -----------------------------------------------------------------------------
# Sample Size
Total <- 279
# Number of variables
J <- 19 
# we only have one replicate for each of the variables
Rj <- rep(1, J)
# Nijr indicates the number of ranking levels for each variable.
# Since all our data is multinomial it should be an array of all 1s
Nijr <- array(1, dim = c(Total, J, max(Rj)))
# Number of sub-populations
K <- 3
# There are 3 choices for each of the variables ranging from 0 to 2.
Vj <- rep(3, J)
# we initialize alpha to .2
alpha <- rep(.2, K)
# All variables are multinomial
dist <- rep("multinomial", J)
# obs are the observed responses. it is a 4-d array indexed by i,j,r,n
# note that obs ranges from 0 to 2 for each response
obs <- array(0, dim = c(Total, J, max(Rj), max(Nijr)))
obs[, , 1, 1] <- as.matrix(ANES)

# Initialize theta randomly with Dirichlet distributions
set.seed(123)
theta <- array(0, dim = c(J, K, max(Vj)))
for (j in 1:J) {
    theta[j, , ] <- gtools::rdirichlet(K, rep(.8, Vj[j]))
}

# Create the mixedMemModel
# This object encodes the initialization points for the variational EM algorithim
# and also encodes the observed parameters and responses
initial <- mixedMemModel(Total = Total, J = J, Rj = Rj,
                         Nijr = Nijr, K = K, Vj = Vj, alpha = alpha,
                         theta = theta, dist = dist, obs = obs)

## -----------------------------------------------------------------------------
computeELBO(initial)
st = proc.time()
#printStatus 1 indicates that status updates will be printed
# printMod 25 indicates that status updates will only be printed ever 25th step
out <- mmVarFit(initial, printStatus = 1, printMod = 25)
end <- proc.time()
computeELBO(out)
time <- end - st

## -----------------------------------------------------------------------------
summary(out)

## -----------------------------------------------------------------------------
#read in GM estimates
# Note that Gross and Manrique-Vallier do not report estimates for the third group, so they have been set to 0 in the data
data(gmv_theta)
#
optimal.perm <- findLabels(out, gmv_theta)
# display the permutation as well as the weighted squared error loss
optimal.perm
# save object with permuted labels
out.permute <- permuteLabels(out, optimal.perm$perm)

## -----------------------------------------------------------------------------
# calculate the expected value
model.exp <- matrix(0, nrow = J, ncol = max(Vj))
for(j in 1:J){
  model.exp[j, ] <- (out.permute$alpha / sum(out.permute$alpha)) %*% out.permute$theta[j, , ]
}
model.exp <- model.exp*out.permute$Total

# aggregate the observed counts
obs.counts <- t(apply(obs[, , 1, 1] + 1, MARGIN = 2, tabulate))
chi.squared <- sum( (obs.counts - model.exp)^2 / model.exp)

## ----results = 'asis', echo = FALSE-------------------------------------------
tab <- cbind(model.exp[, 1], obs.counts[, 1],
             model.exp[, 2], obs.counts[, 2],
             model.exp[, 3], obs.counts[, 3])
rownames(tab) <- colnames(ANES)
colnames(tab) <- c("Exp Agree", "Obs Agree",  "Exp Can't Decide", "Obs Can't Decide", "Exp Disagree", "Obs Disagree")
xtable(tab, digits = 1, caption = "Expected responses based on fitted parameters and Observed responses")


## -----------------------------------------------------------------------------
# The chi squared statistic
chi.squared
# p-value
pchisq(chi.squared, df = (J-1) * max(Vj))

## -----------------------------------------------------------------------------
# The estimated quantities can be exctracted from the output model
names(out)
out$alpha

## ----theta, fig.height = 7, fig.width = 7, fig.align = 'center', fig.cap = "Individuals were asked to indicate their level of agreement with 19 opinion based statements. The fitted multinomial response probabilities for each ideology bloc to each of the first 10 statements are displayed. On the horizontal axis, 0 indicates agree, 1 indicates can't decide, and 2 indicates disagree. The estimates from our variational analysis are denoted by the dots, the Gross and Manrique-Vallier results using MCMC are shown by the red X's. Since they do not report estimates for Group 3, the Can't Decide group, those estimates are shown as 0's. The estimates for the first 10 questions are shown."----
plot(out.permute, type = "theta", compare = gmv_theta, varNames = colnames(ANES),
         groupNames = c("Conservative", "Liberal", "Undecided"), nrow = 10, indices = c(1:10))

## ----consprop, fig.width = 4, fig.height = 4, fig.align='center', fig.cap = "Propensity to agree with each opinion-based statement for the conservative bloc"----
pop1VarOrder <- colnames(ANES)[order(out.permute$theta[, 1, 1], decreasing = T)]
pop1VarAgree <- sort(out.permute$theta[, 1, 1], decreasing = T)
barplot(height = pop1VarAgree, names.arg = pop1VarOrder,
        main = "Propensity to Agree",
        cex.names = .7, las = 2, xlab = "Value Statements",
        ylab = expression(paste(theta["j,1,1"])),
        col = ifelse(pop1VarAgree > .5, "forestgreen", "darkred"))

## ----libprop, fig.width = 4, fig.height = 4,fig.align='center',  fig.cap = "Propensity to agree with each opinion-based statement for the liberal bloc"----
pop2VarOrder <- colnames(ANES)[order(out.permute$theta[, 2, 1], decreasing = T)]
pop2VarAgree <- sort(out.permute$theta[, 2, 1], decreasing = T)
barplot(height = pop2VarAgree,
        names.arg = pop2VarOrder, main = "Propensity to Agree",
        cex.names = .7, las = 2, xlab = "Value Statements",
        ylab = expression(paste(theta["j,2,1"])),
        col = ifelse(pop2VarAgree > .5, "forestgreen", "darkred"))

## -----------------------------------------------------------------------------
# Point estimates for lambda
lambda.point <- out.permute$phi / rowSums(out.permute$phi)
# number of individuals which exhibit more than .3 degree of membership
# in the undecided group
sum(lambda.point[, 3] >= .3)
# number of can not decide responses from those with high membership in undecided group
sum(ANES[which(lambda.point[, 3] >= .3),] == 1)

## -----------------------------------------------------------------------------
relativeFrequency <- out.permute$alpha / sum(out.permute$alpha)

## ----echo = FALSE, results = 'asis'-------------------------------------------
table.alpha = rbind(out.permute$alpha, relativeFrequency)
rownames(table.alpha) = c("Estimated Alpha", "Estimate Relative Frequency")
colnames(table.alpha) = c("Conservatives", "Liberals", "Undecided")
xtable(table.alpha, caption = "Variational Estimates of Alpha", digits = 3)

## ----polarizing, fig.width = 4, fig.height = 4, fig.align = 'center', fig.cap = "Using Hellinger distance indicates that the most polarizing statements are IND 5, IND 3, and EQ 7. These were all issues involving opportunity for advancement"----
hellingerDist <- (1/sqrt(2)) * sqrt(rowSums( (sqrt(out.permute$theta[, 1, ]) 
                                          - sqrt(out.permute$theta[, 2, ]))^2))
barplot(sort(hellingerDist, decreasing = T),
        names.arg = colnames(ANES)[order(hellingerDist, decreasing = T)],
        main = "Hellinger Distance",
        cex.names = .7, las = 2, ylab = "Hellinger Distance",
        ylim = c(0, 1))
mtext("Between Conservatives and Liberals")
colnames(ANES)[order(hellingerDist, decreasing = T)][1:3]

## -----------------------------------------------------------------------------
# number of individuals with at least 40% membership in
# both conservative and liberal blocs
sum( (lambda.point[, 1] > .4) & (lambda.point[, 2] > .4))

## ----mems, fig.width = 4, fig.height = 4, fig.align = 'center', fig.cap = "Estimated memberships for first 42 individuals"----
plot(out.permute, type = "membership", indices = c(1:20), nrow = 5, ncol = 4, main = "Estimated Memberships",
     groupNames = c("Cons", "Lib", "Cant Decide"), fitNames = c("mixedMem"))

## ----posteriorMem, fig.cap = "We observe a relatively high rate of intra-individual mixing", fig.width = 5, fig.align = 'center', fig.height = 5----
index <- order(lambda.point[, 1])

# variance of posterior membership in conservative bloc
var.Mem <- out.permute$phi[, 1] * (rowSums(out.permute$phi) -
                                 out.permute$phi[, 1]) /
  (rowSums(out.permute$phi)^2 * (rowSums(out.permute$phi) + 1))
# plot posterior means
plot(sort(lambda.point[, 1]), pch = 19,
     main = "Posterior Membership in Conservative Bloc",
     ylab = "Posterior Membership", xlab = "Individual",
     cex = .8, ylim = c(0, 1))

# marginal distirbution of Dirichlet, is Beta distribution, so we can get posterior CI
# plot posterior 90% CI
ci_up <- qbeta(.975, out.permute$phi[index, 1], rowSums(out.permute$phi[index, c(2:3)]))
ci_low <- qbeta(.025, out.permute$phi[index, 1], rowSums(out.permute$phi[index, c(2:3)]))
segments(x0 = c(1:out$Total), y0 = ci_up, y1 = ci_low, col = "gray", lwd = .3, lty = 1)
legend("bottomright", legend = c("posterior mean", "95% CI"), pch = c(19, NA),
       lty = c(NA, 1), col = c("black", "gray"))