#!/usr/bin/Rscript
# R/num_max.R Author "Nathan Wycoff <nathanbrwycoff@gmail.com>" Date 04.20.2018
## Numerically maximize the posterior with latent vars integrated out.
#' Negative Log Incomplete Posterior
#'
#' The log posterior with the latent class assignments integrated out, evaluated at PHI, THETA, PSI as specified, given docs.
#'
#' @param PHI A K by V matrix with simplex valued rows giving the the probabilty of words (cols) in topics (rows).
#' @param THETA M by P real valued matrix, giving P-d locations of documents.
#' @param PSI K by P real valued matrix, giving P-d locations of topics
#' @param docs A term frequency matrix, that is, one with a row for each document, a column for each vocab word, and integer entries indicating the occurence of a word in a doc.
#' @param eta The exchangible dirichlet prior on words in a topic.
#' @param beta The precision for topic locations, a positive scalar.
#' @param gama The precision for document locations, a positive scalar.
#' @param soft_PHI A boolean, if TRUE, PHI was provided on the logodds scales.
#' @return A scalar, giving the negative log posterior density at the point.
#' @export
nlip <- function(PHI, THETA, PSI, docs, eta, gamma, beta, soft_PHI) {
if (soft_PHI) {
PHI <- t(apply(cbind(PHI, 0), 1, softmax))
}
## Generate parameters
ll <- 0
# Topic by Word Matrix
for (k in 1:K){
ll <- ll + (eta - 1) * sum(log(PHI[k,]))
}
# Topic Locations
for (k in 1:K) {
for (p in 1:P) {
ll <- ll + dnorm(PSI[k,p],0,1/sqrt(beta), log = TRUE)
}
}
# Doc Locations
for (i in 1:M) {
for (p in 1:P) {
ll <- ll + dnorm(THETA[i,p],0,1/sqrt(gamma), log = TRUE)
}
}
## Transform Parameters
# Get probability of topic in each doc
RHO <- matrix(NA, nrow = M, ncol = K)
for (i in 1:M) {
dists <- sapply(1:K, function(k) sum((THETA[i,] - PSI[k,])^2))
RHO[i,] <- softmax(-0.5 * dists)
}
# Prob of each word in each doc
PI <- RHO %*% PHI
## Generate Data
for (i in 1:M) {
doc <- docs[[i]]
for (w in doc) {
ll <- ll + log(PI[i, w])
}
}
return(-ll)
}
num_nlip_grad <- function(PHI, THETA, PSI, docs, eta, gamma, beta, soft_PHI = FALSE) {
h <- 1e-6
nP <- matrix(NA, nrow = K, ncol = P)
for (k in 1:K) {
for (p in 1:P) {
f <- nlip(PHI, THETA, PSI, docs, eta, gamma, beta, soft_PHI = soft_PHI)
PSI[k,p] <- PSI[k,p] + h
fp <- nlip(PHI, THETA, PSI, docs, eta, gamma, beta, soft_PHI = soft_PHI)
nP[k,p] <- (fp - f) / h
}
}
nT <- matrix(NA, nrow = M, ncol = P)
for (m in 1:M) {
for (p in 1:P) {
f <- nlip(PHI, THETA, PSI, docs, eta, gamma, beta, soft_PHI = soft_PHI)
THETA[m,p] <- THETA[m,p] + h
fp <- nlip(PHI, THETA, PSI, docs, eta, gamma, beta, soft_PHI = soft_PHI)
nT[m,p] <- (fp - f) / h
}
}
if (soft_PHI) {
nR <- matrix(NA, nrow = K, ncol = V-1)
for (k in 1:K) {
for (v in 1:(V-1)) {
f <- nlip(PHI, THETA, PSI, docs, eta, gamma, beta, soft_PHI = TRUE)
PHI[k,v] <- PHI[k,v] + h
fp <- nlip(PHI, THETA, PSI, docs, eta, gamma, beta, soft_PHI = TRUE)
nR[k,v] <- (fp - f) / h
}
}
return(list( grad_THETA = nT, grad_PSI = nP, grad_PHI = nR))
} else {
nH <- matrix(NA, nrow = K, ncol = V)
for (k in 1:K) {
for (v in 1:V) {
f <- nlip(PHI, THETA, PSI, docs, eta, gamma, beta, soft_PHI = FALSE)
PHI[k,v] <- PHI[k,v] + h
fp <- nlip(PHI, THETA, PSI, docs, eta, gamma, beta, soft_PHI = FALSE)
nH[k,v] <- (fp - f) / h
}
}
}
return(list( grad_THETA = nT, grad_PSI = nP,grad_PHI = nH))
}
#' The Gradient of the Log Incomplete Posterior
#'
#' this boii accepts a list of docs for right now.
g_nlip <- function(PHI, THETA, PSI, docs, eta, gamma, beta, soft_PHI) {
Ns <- sapply(docs, length)
if (soft_PHI) {
PHI <- t(apply(cbind(PHI, 0), 1, softmax))
}
# Prior Contribution
grad_THETA <- -gamma * THETA
grad_PSI <- -beta * PSI
grad_PHI <- (eta - 1) / PHI
# Get probability of topic in each doc
RHO <- matrix(NA, nrow = M, ncol = K)
for (i in 1:M) {
dists <- sapply(1:K, function(k) sum((THETA[i,] - PSI[k,])^2))
RHO[i,] <- softmax(-0.5 * dists)
}
# Prob of each word in each doc
PI <- RHO %*% PHI
# Gradients for THETA
for (i in 1:M) {
for (j in 1:Ns[i]) {
w <- docs[[i]][j]
for (k in 1:K) {
for (l in 1:K) {
grad_THETA[i,] <- grad_THETA[i,] +
PHI[k, w] / PI[i, w] * RHO[i,k] * (as.numeric(l==k) - RHO[i,l]) * (PSI[l, ] - THETA[i,])
}
}
}
}
#Gradients for PSI
for (i in 1:M) {
for (j in 1:Ns[i]) {
w <- docs[[i]][j]
for (k in 1:K) {
for (l in 1:K) {
grad_PSI[k,] <- grad_PSI[k,] +
PHI[l, w] / PI[i, w] * RHO[i,l] * (as.numeric(l==k) - RHO[i,k]) * (THETA[i,] - PSI[k,])
}
}
}
}
# Gradient for PHI in simplex form.
for (k in 1:K) {
for (i in 1:M) {
for (j in 1:Ns[i]) {
w <- docs[[i]][j]
grad_PHI[k, w] <- grad_PHI[k, w] + RHO[i,k] / PI[i, w]
}
}
}
if (soft_PHI) {
# Propogate to real valued form if desired
grad_GAMM <- matrix(0, nrow = K, ncol = V-1)
for (k in 1:K) {
for (v in 1:(V-1)) {
for (u in 1:V) {
grad_GAMM[k, v] <- grad_GAMM[k,v] +
grad_PHI[k, u] * PHI[k,u] * (as.numeric(u==v) - PHI[k,v])
}
}
}
return(list(grad_THETA = -grad_THETA, grad_PSI = -grad_PSI, grad_PHI = -grad_GAMM))
} else {
return(list(grad_THETA = -grad_THETA, grad_PSI = -grad_PSI, grad_PHI = -grad_PHI))
}
}
#' Numerically Maximize complete Posterior.
#'
#' Numerically maximize the complete posterior of the PLSV model.
#'
#' @param docs A list of integer vectors, each subvector representing a document in the form of the indices of its words in the vocab.
#' @param K The number of topics, an integer scalar.
#' @param V The number of unique words, an integer scalar.
#' @param P The dimensionality of the embedding space, an integer, usually 2.
#' @param eta The exchangible dirichlet prior on words in a topic.
#' @param beta The precision for topic locations, a positive scalar.
#' @param gama The precision for document locations, a positive scalar.
#' @param make_plot A boolean, if TRUE, will make a ggplot visualization of the topics and documents, with topics in red.
#' @param THETA_init A real matrix with as many rows as docs has and P many columns, giving an initial value for THETA.
#' @param PSI_init A real matrix with K many rows and P many columns, giving an initial value for PSI
#' @param PHI_init A matrix of K many V-1-simplex valued rows, giving the initial value for PHI
#' @param THETA_fix A list of lists, used to fix rows of THETA to a given value. Each sublist has two elements: 'ind' and 'val'. 'ind' Indicates the row, 1-index, of THETA to fix, and 'val', a real valued P-vector, indicates the value to fix it to.
#' @return A list containing ests, a list with PHI, the topic by document matrix, THETA, the document locations in P-D space, and PSI, the topic locations in P-D space.
#' @export
num_post_plsv <- function(docs, K, V, P, eta, gamma, beta,
make_plot = FALSE, THETA_init = NULL,
PSI_init = NULL, PHI_init = NULL,
THETA_fix = list(), PSI_fix = list()) {
M <- length(docs)
# Get the indices of fixed params.
THETA_inds <- sapply(THETA_fix, function(i) i$ind)
PSI_inds <- sapply(PSI_fix, function(i) i$ind)
#TODO: This code doesn't work lmao
#if (max(THETA_inds) > M || max(PSI_inds) > K) {
# stop("Check the indices of your fixed values: one of them is larger than the corresponding matrix")
#}
## Maximize the log posterior with respect to the doc and topic locations.
# A vector representation of our three matrices, for use in numerical optimization
# We drop the rows which are fixed here
mat3par <- function(PHI_n, PSI, THETA) {
#Check if there's any fixed vals.
if (length(THETA_inds) > 0) {
THETA_pass <- as.numeric(THETA[-THETA_inds,])
} else {
THETA_pass <- as.numeric(THETA)
}
if (length(PSI_inds) > 0) {
PSI_pass <- as.numeric(PSI[-PSI_inds,])
} else {
PSI_pass <- as.numeric(PSI)
}
#Only otimize the fixed vals.
par <- c(as.numeric(PHI_n),
#as.numeric(PSI[-PSI_inds,]),
PSI_pass,
THETA_pass)
return(par)
}
# Return back to the matrix repr, for use in numerical optimization
par3mat <- function(par, K, V, P) {
#Convert from vectors to matrices
to1 <- K*(V-1)
to2 <- (K - length(PSI_inds))*P
PHI_n <- par[1:to1]
PSI <- par[(to1+1):(to1+to2)]
THETA <- par[(to1+to2+1):(length(par))]
#Recover the matrix form
PHI_n <- matrix(PHI_n, ncol = V-1)
PSI <- matrix(PSI, ncol = P)
THETA <- matrix(THETA, ncol = P)
#Recover the fixed rows.
THETA <- t(sapply(1:M, function(i) {
if (i %in% THETA_inds) {
return(THETA_fix[[which(THETA_inds==i)]]$val)
} else {
ind <- i - sum(THETA_inds < i)
return(THETA[ind,])
}
}))
PSI <- t(sapply(1:K, function(i) {
if (i %in% PSI_inds) {
return(PSI_fix[[which(PSI_inds==i)]]$val)
} else {
ind <- i - sum(PSI_inds < i)
return(PSI[ind,])
}
}))
return(list(PHI_n = PHI_n, THETA = THETA, PSI = PSI))
}
joint_nlip_wrap <- function(par) {
ret <- par3mat(par, K, V, P)
nlipC(ret$PHI_n, ret$THETA, ret$PSI, docs, eta, gamma, beta)
}
Ns <- sapply(docs, length)
gradwrap <- function(par) {
ret <- par3mat(par, K, V, P)
grad <- g_nlipC(ret$PHI_n, ret$THETA, ret$PSI, Ns, docs, eta, gamma, beta)
mat3par(grad$grad_PHI, grad$grad_PSI, grad$grad_THETA)
}
# Do random inits if none were provided
if (is.null(PSI_init)) {
PSI_init <- matrix(rnorm(K*P), ncol = P)
}
if (is.null(THETA_init)) {
THETA_init <- matrix(rnorm(M*P), ncol = P)
}
if (is.null(PHI_init)) {
PHI_n_init <- matrix(rnorm(K*(V-1)), ncol = V-1)
} else {
#Convert users simplex to real valued matrix for optim
PHI_n_init <- t(apply(PHI_init, 1, inv_softmax))[,-V]
}
#TODO: Default inits based on LDA
# Do the actual optimization
fit <- optim(mat3par(PHI_n_init, PSI_init, THETA_init), joint_nlip_wrap,
gr = gradwrap, method = 'BFGS', control = list('maxit' = 1e3))
ests <- par3mat(fit$par, K, V, P)
if (fit$convergence) {
warning("Convergence Failure in num_post_plsv")
}
#Get PHI from PHI_n, with a 0 fixed as the last word for identifiability.
ests$PHI <- t(apply(cbind(ests$PHI_n, 0), 1, softmax))
ests$PHI_n <- NULL
if (make_plot == TRUE) {
comp_scat2d(ests$THETA, ests$PSI)
}
return(list(par = ests, logpost = -fit$value, fit = fit))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.