#' @title Traceplot of log-likelihood.
#'
#' @description Trace plot of the log-likelihood (the x-axis is the MCMC iteration, and the y-axis is the log-likelihood).
#' The MCMC algorithm has likely converged once this plot levels off and the series fluctuates like white noise.
#' If the log-likelihood is still increasing at the last iteration, go back and run the algorithm for more iterations (using \code{fitLDA}).
#'
#' @param data a matrix of log-likelihood values, with one column per MCMC chain.
#' @param zoom logical; if \code{TRUE}, a second plot showing only the final iterations is produced.
#' @param prop if \code{zoom == TRUE}, this number (between 0 and 1) sets the proportion of final iterations to plot.
#' @param start the number of iterations (if any) performed before these chains; used to offset the x-axis labels.
#' @export
#'
#' @examples
#' one.chain <- matrix(rnorm(1000, mean=-500, sd=100), ncol=1) #hopefully our log-likelihood is essentially random noise!
#' plotLoglik(one.chain)
#' par(mfrow=c(1,2))
#' plotLoglik(one.chain, zoom=TRUE)
#' three.chains <- matrix(rnorm(3000, mean=-500, sd=100), ncol=3)
#' plotLoglik(three.chains)
#' plotLoglik(three.chains, zoom=TRUE, prop=.1, start=5000)
#'
plotLoglik <- function(data=matrix(), zoom=FALSE, prop=0.5, start=0) {
#par(mfrow=c(1,2))
#on.exit(par(mfrow=c(1, 1)))
# First, a plot of all K series, from first iteration to last:
n <- dim(data)[1]
base <- floor(log(-data[n, 1], base=10))
y.vec <- data[,1]*10^(-base)
rg <- range(y.vec)
plot(1:n, y.vec, las=1, xlab="Iteration", ylab=paste("Log-likelihood (x 10^-", base, ")", sep=""),
ylim=rg, xaxt="n", type="l", col=1, yaxt="n")
ticks <- seq(0, n, length=7)
axis(1, at=ticks, labels=round(ticks)+start)
ticks2 <- seq(rg[1], rg[2], length=6)
axis(2, at=ticks2, labels=round(ticks2), las=1)
K <- dim(data)[2]
if (K > 1) {
for (k in 2:K) {
y.vec <- data[,k]*10^(-base)
lines(1:n, y.vec, col=k)
}
}
# Second, a plot of all K series over the final prop proportion of iterations only (to zoom in on this section and assess convergence):
if (zoom) {
second <- floor(n*(1-prop)):n
rg <- range(data[second, ]*10^(-base))
plot(second, data[second, 1]*10^(-base), las=1, xlab="Iteration",
ylab=paste("Log-likelihood (x 10^-", base, ")", sep=""), ylim=rg, xaxt="n", type="l", col=1, yaxt="n")
ticks <- round(seq(min(second), n, length=7))  # match the zoomed x-range rather than assuming prop = 0.5
axis(1, at=ticks, labels=ticks+start)
ticks2 <- seq(rg[1], rg[2], length=6)
axis(2, at=ticks2, labels=round(ticks2), las=2)
if (K > 1){
for (k in 2:K){
lines(second, data[second, k]*10^(-base), col=k)
}
}
}
}
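# A rough numeric companion to the visual check above: compare the mean
# log-likelihood over an early and a late window of the chain. The simulated
# chain and the 25% window size below are illustrative only, not part of plotLoglik.
sim.chain <- rnorm(1000, mean = -500, sd = 100)  # stand-in for a real log-likelihood series
early <- mean(sim.chain[1:250])
late  <- mean(sim.chain[751:1000])
(late - early) / sd(sim.chain)  # values near 0 are consistent with a flat, noise-like trace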
#' @title Compute topic-word and document-topic probability distribution matrices, and re-label topic indices
#'
#' @description This function assumes that \code{word.id}, \code{doc.id}, and \code{topic.id} are aligned element-by-element:
#' the first element of \code{word.id} corresponds to the first element of \code{doc.id}, which corresponds to the first
#' element of \code{topic.id}; the second element of \code{word.id} corresponds to the second element of \code{doc.id},
#' which corresponds to the second element of \code{topic.id}; and so on. Also, the ordering of the elements of \code{vocab}
#' is assumed to correspond to the values of \code{word.id}, so that the first element of \code{vocab} is the token with \code{word.id}
#' equal to 1, the second element of \code{vocab} is the token with \code{word.id} equal to 2, etc.
#'
#' @param word.id a numeric vector with the token id of each token occurrence in the data.
#' @param doc.id a numeric vector containing the document id number of each token occurrence in the data.
#' @param topic.id a numeric vector with the sampled topic assignment of each token occurrence in the data (from the fitted topic model).
#' @param vocab a character vector of the unique words included in the corpus. The length of this vector should match the max value of \code{word.id}.
#' @param alpha Dirichlet hyperparameter. See \link{fitLDA}.
#' @param beta Dirichlet hyperparameter. See \link{fitLDA}.
#' @param sort.topics sorting criterion for topics: "None" (the default, no sorting), "byDocs" to sort topics by the
#' number of documents for which they are the main topic, or "byTerms" to sort topics by the total number of tokens assigned to them.
#' @param K the number of topics in the fitted model; if not supplied, it is taken to be \code{max(topic.id)}.
#'
#' @return A list. The first element, \code{phi.hat}, is a matrix containing the distribution over tokens for each topic,
#' where the rows correspond to topics. The second element, \code{theta.hat}, is a matrix containing the distribution over topics for each
#' document, where the rows correspond to documents. The element \code{topic.id} is the vector of sampled topics from the LDA fit,
#' with topic indices re-labeled according to the \code{sort.topics} argument. The list also contains \code{main.topic},
#' the topic to which the most tokens in each document are assigned, and \code{topic.order}, the permutation used to re-label the topics.
#' @export
#' @examples
#' data(APinput)
#' #takes a while
#' \dontrun{o <- fitLDA(APinput$word.id, APinput$doc.id)}
#' data(APtopics) #load output instead for demonstration purposes
#' probs <- getProbs(word.id=APinput$word.id, doc.id=APinput$doc.id, topic.id=APtopics$topics,
#' vocab=APinput$vocab)
#' head(probs$phi.hat[,1:5])
#' head(probs$theta.hat)
#'
getProbs <- function(word.id=numeric(), doc.id=numeric(), topic.id=numeric(), vocab=character(),
alpha=0.01, beta=0.01, sort.topics=c("None", "byDocs", "byTerms"), K=integer()) {
# K is number of topics
stopifnot(sort.topics[1] %in% c("None", "byDocs", "byTerms"))
#stopifnot(sort.terms[1] %in% c("none", "freq", "distinct", "saliency"))
if (length(sort.topics) > 1 && !identical(sort.topics, c("None", "byDocs", "byTerms"))) stop("Please enter only one topic sorting choice")
#if (!all(sort.terms == c("none", "freq", "distinct", "saliency")) & length(sort.terms) > 1) stop("Please enter only one term sorting choice")
N <- length(word.id)
stopifnot(N == length(doc.id), N == length(topic.id))
if (length(K) == 0) K <- max(topic.id)  # infer the number of topics when K is not supplied
# compute phi, the matrix of topic-word probability distributions
df <- table(topic.id, word.id)
W <- max(word.id)
D <- max(doc.id)
stopifnot(W == length(vocab))
CTW <- matrix(0, K, W)
CTW[, as.numeric(colnames(df))] <- df
# compute theta, the matrix of document-topic probability distributions
df <- table(doc.id, topic.id)
CDT <- matrix(0, D, K)
CDT[as.numeric(rownames(df)),] <- df
# compute the topic to which the most tokens from each doc are assigned
main.topic <- max.col(CDT)
main.topic.table <- table(main.topic)
# compute posterior point estimates of phi.hat and theta.hat:
CTW.b <- CTW + beta
phi.hat <- CTW.b/apply(CTW.b, 1, sum)
CDT.a <- CDT + alpha
theta.hat <- CDT.a/apply(CDT.a, 1, sum)
#set relevant names for the two matrices
#rownames(phi.hat) <- rownames(phi.hat, do.NULL=FALSE, prefix= "Topic")
colnames(phi.hat) <- vocab
#colnames(theta.hat) <- colnames(theta.hat, do.NULL=FALSE, prefix= "Topic")
#sort topics (if necessary)
topic.o <- 1:K
if (sort.topics[1] == "byDocs") {
# order the topics by the number of documents for which they are the main topic:
topic.o <- order(main.topic.table, decreasing=TRUE)
main.topic <- match(main.topic, topic.o)
}
if (sort.topics[1] == "byTerms") {
topic.o <- order(apply(CTW, 1, sum), decreasing=TRUE)
main.topic <- match(main.topic, topic.o)
}
phi.hat <- phi.hat[topic.o, ]
theta.hat <- theta.hat[, topic.o]
topic.id <- match(topic.id, topic.o)
#sort terms (if necessary)
if (FALSE) { # put this here to skip this block of code (Kenny - 9/18/13)
term.o <- NULL
if (sort.terms[1] != "none") {
word.tab <- table(word.id)
if (sort.terms[1] == "freq") {
term.o <- order(word.tab, decreasing=TRUE)
} else {
if (sort.terms[1] %in% c("distinct", "saliency")) {
topic.tab <- table(topic.id)
pt <- topic.tab/sum(topic.tab)
t.w <- t(t(phi.hat)/apply(phi.hat, 2, sum)) #P(T|w)
kernel <- t.w*log(t.w/as.vector(pt))
distinct <- apply(kernel, 2, sum)
}
if (sort.terms == "distinct") term.o <- order(distinct, decreasing=TRUE)
if (sort.terms == "saliency") {
pw <- word.tab/sum(word.tab)
saliency <- pw*distinct
term.o <- order(saliency, decreasing=TRUE)
}
}
}
} # End sort terms section
#if (!is.null(term.o)) phi.hat <- phi.hat[,term.o]
#if (sort.topics[1] != "byDocs") main.topic=NULL # now we have main.topic no matter what the sort.topic option
rownames(phi.hat) <- 1:K
colnames(theta.hat) <- 1:K
colnames(phi.hat) <- vocab
return(list(phi.hat=phi.hat, theta.hat=theta.hat, topic.id=topic.id, main.topic=main.topic, topic.order=topic.o))
}
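# A small worked illustration of the smoothed point estimates computed above,
# using a made-up 2-topic, 3-word count matrix (these values do not come from a
# real fit): phi.hat[k, ] = (CTW[k, ] + beta) / sum(CTW[k, ] + beta).
CTW.toy <- matrix(c(5, 0, 1,
                    0, 3, 2), nrow = 2, byrow = TRUE)  # topic-by-word counts
phi.toy <- (CTW.toy + 0.01) / rowSums(CTW.toy + 0.01)  # beta = 0.01, as in the default
rowSums(phi.toy)  # each topic's distribution over the vocabulary sums to 1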
#' @title Compute distinctiveness and saliency of the words in the vocabulary for a given topic model
#'
#' @description This function computes 'distinctiveness' and 'saliency' as defined in Chuang et al. (2012), 'Termite'.
#'
#' @param word.id an integer vector with the token id of each token occurrence in the data.
#' @param topic.id an integer vector with the topic id for each token in the data.
#' @param phi.hat a numeric (K x W) matrix with topic distributions on each of the K rows
#'
#' @return A list of two length-W vectors: the distinctiveness and saliency of each of the W tokens in the vocabulary.
#'
#' @export
#'
token.rank <- function(word.id=integer(), topic.id=integer(), phi.hat=numeric()) {
word.tab <- table(word.id)
topic.tab <- table(topic.id)
pt <- topic.tab/sum(topic.tab)
t.w <- t(t(phi.hat)/apply(phi.hat, 2, sum)) #P(T|w)
kernel <- t.w*log(t.w/as.vector(pt))
distinct <- as.numeric(apply(kernel, 2, sum))
pw <- word.tab/sum(word.tab)
saliency <- as.numeric(pw*distinct)
return(list(distinct=distinct, saliency=saliency))
}
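# A toy check of the quantities above, with a made-up 2-topic, 3-word phi matrix
# and made-up marginal token frequencies (pw.toy); pt.toy plays the role of
# table(topic.id)/length(topic.id):
phi.toy <- matrix(c(0.7, 0.2, 0.1,
                    0.1, 0.3, 0.6), nrow = 2, byrow = TRUE)
pt.toy <- c(0.5, 0.5)
tw.toy <- t(t(phi.toy) / colSums(phi.toy))               # P(T | w), column-normalized
distinct.toy <- colSums(tw.toy * log(tw.toy / pt.toy))   # KL(P(T|w) || P(T)) for each word
pw.toy <- c(0.4, 0.25, 0.35)
saliency.toy <- pw.toy * distinct.toy                    # distinctiveness weighted by frequency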
#' @title Plot probable tokens for a given topic
#'
#' @param phi numeric vector with the probability of each token for a given topic.
#' @param vocab character vector of the vocabulary for the corpus
#' @param n.tokens the number of tokens to plot; the default is \code{n.tokens = 20}.
#' @param lambda a parameter between 0 and 1 to control how tokens are ranked within topics
#' @param p the marginal probabilities of the tokens in the vocabulary
#' @param ... additional arguments to the plot() function
#'
#' @details The ranking of tokens within topics is based on a weighted average of the log-probability of a
#' token (given the topic) and the log-lift, where the lift of a token is defined as the probability of the token
#' (given the topic) divided by the marginal probability of the token (i.e. across all topics).
#' The score that determines the top \code{n.tokens} tokens to plot is
#' \code{lambda * log(p(token | topic)) + (1 - lambda) * log(p(token | topic)/p(token))}.
#'
#' Note: the ordering of \code{phi}, \code{vocab}, and \code{p} must be the same
#' (i.e. the nth element of each vector must correspond to the same token)
#'
#' @export
#'
#' @examples
#' data(APinput)
#' data(APtopics) #load output instead for demonstration
#' probs <- getProbs(word.id=APinput$word.id, doc.id=APinput$doc.id, topic.id=APtopics$topics,
#' vocab=APinput$vocab)
#' #THE ORDERING OF phi, vocab and p MUST MATCH!
#' tokens <- factor(APinput$vocab[APinput$word.id], levels=colnames(probs$phi.hat))
#' token.tab <- table(tokens)
#' p <- token.tab/sum(token.tab)
#' plotTokens(phi=probs$phi.hat[1,], vocab=names(p), n.tokens=30, lambda=1/3, p)
#' # plot all the topics!
#' \dontrun{
#' for (i in seq_along(probs$phi.hat[,1])) {
#' plotTokens(phi=probs$phi.hat[i,], vocab=names(p), n.tokens=30, lambda=1/3, p=p)
#' }
#' }
plotTokens <- function(phi=vector(), vocab=character(), n.tokens=20, lambda=0.5, p=vector(), ...) {
# draw figure of the top words for each topic:
if (length(dim(phi)) > 1) {
warning("phi should be a vector. Only the first topic will be used for plotting.")
if (which.max(dim(phi)) == 2) { #guess that max dimension holds the terms
phi <- phi[1,]
} else {
phi <- phi[,1]
}
}
if (length(vocab) == 0) warning("vocab is missing.")
if (length(p) == 0) warning("marginal probability vector for tokens is missing.")
if (length(phi) != length(vocab)) warning("mismatch between length of probability vector for this topic and length of vocab")
if (length(phi) != length(p)) warning("mismatch between length of probability vector for this topic and length of marginal probability vector for all tokens")
#if (length(vocab) != length(p)) warning("mismatch between length of probability vector for this topic and length of vocab")
#vocab <- unique(tokens)
#if (length(vocab) == 0) {
# vocab <- names(phi)
# if (length(vocab) == 0) {
# warning("Vocabulary (names of phi) are missing. Please supply them.")
# }
#}
#N <- length(tokens)
#k <- length(phi)
W <- length(vocab)
# Get overall proportion for each word
#p <- table(factor(tokens, levels=vocab))/N
#p <- p[names(phi)]
#stopifnot(all(names(p) == names(phi))) #order matters!
w <- lambda*log(phi) + (1 - lambda)*log(phi/p)
o <- order(w, decreasing=TRUE)
top <- phi[o][1:n.tokens]
topwords <- vocab[o][1:n.tokens]
topprobs <- as.numeric(top)
#po <- p[o]
#p.vec <- po[names(po) %in% topwords]
p.vec <- p[o][1:n.tokens]
# set up basic plotting variables:
x.vec <- c(0, cumsum(nchar(topwords))[-n.tokens]) + 1:n.tokens - 1
y.vec <- topprobs
yl <- "Probability"
xl <- "Word Rank"
xr <- sum(nchar(topwords)) + n.tokens
midwords <- x.vec + nchar(topwords)/2 - 2
# start the plot:
plot(x.vec, y.vec, type="n", las=1, ylab=yl, xlab=xl, ylim=c(0, max(y.vec, p.vec)), xlim=c(0, xr), xaxt="n", ...)
abline(h=0, col=gray(0.7))
axis(1, at=midwords, labels=1:n.tokens)
# plot the word probabilities within this topic:
for (j in 1:n.tokens) {
for (k in 1:nchar(topwords[j])) {
text(x.vec[j] + k - 1, y.vec[j], substr(topwords[j], k, k), family="mono", pos=2)
}
}
# plot the word probabilities across topics in light gray:
for (j in 1:n.tokens){
for (k in 1:nchar(topwords[j])){
text(x.vec[j] + k - 1, p.vec[j], substr(topwords[j], k, k), family="mono", col=gray(0.7), pos=2)
}
}
legend("topright", inset=0.01, col=c(1, gray(0.7)), legend=c("P(Word | Topic)", "P(Word) across all topics"), lty=1, lwd=2)
}
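# The ranking rule from the Details section above, isolated from the plotting
# code: with made-up values for phi (P(token | topic)) and p (marginal P(token)),
# lambda = 0.5 balances within-topic probability against lift.
phi.toy <- c(apple = 0.5, pie = 0.3, the = 0.2)
p.toy   <- c(apple = 0.05, pie = 0.03, the = 0.40)
w.toy   <- 0.5 * log(phi.toy) + 0.5 * log(phi.toy / p.toy)
names(sort(w.toy, decreasing = TRUE))  # "the" is ranked last despite a high within-topic probability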
#' Find representative documents for each topic
#'
#' @param theta Matrix of document-topic probabilities. Could be taken from the output of \link{getProbs}
#' @param docs A character vector where each element is a document from the corpus.
#' The length should be equal to the first dimension of \code{theta}.
#' @param n The number of documents to return within each topic.
#' @export
#' @examples
#' \dontrun{
#' data(APinput)
#' data(APtopics)
#' data(APcorpus)
#' probs <- getProbs(APinput$word.id, APinput$doc.id, APtopics$topic, APinput$vocab, sort.topics="byTerms")
#' top.docs <- topdocs(probs$theta.hat, APcorpus[APinput$category == 0], n=5)
#' #write the file for uploading to shiny...
#' write.table(top.docs, file=paste0(getwd(), "/top5docs.txt"), sep="\t", row.names=FALSE)
#' #save(top.docs, file="~/LDAtool/data")
#' #sanity check (peaks on plot below should line up on same topic)
#' corpus <- APcorpus[APinput$category == 0]
#' idx <- which(corpus %in% top.docs[,"Topic1"])
#' plot(probs$theta.hat[idx[1],], type="l")
#' for (i in 2:length(idx)) {
#' lines(probs$theta.hat[idx[i],], col=i)
#' }
#' }
topdocs <- function(theta, docs, n=30){
stopifnot(dim(theta)[1] == length(docs))
# indices <- apply(theta, 2, function(x) which(order(x, decreasing=TRUE) <= n))
# df <- data.frame(array(docs[indices], dim=c(n, dim(theta)[2])), stringsAsFactors=FALSE)
# names(df) <- colnames(theta)
# return(df)
k <- dim(theta)[2]
tops <- data.frame(matrix(NA, nrow=n, ncol=k))
for (i in 1:k){
o <- order(theta[,i], decreasing=TRUE)
top <- docs[o][1:n]
tops[,i] <- top
}
names(tops) <- colnames(theta)
return(tops)
}
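# A quick check of topdocs() on a made-up 4-document, 2-topic theta matrix:
theta.toy <- matrix(c(0.9, 0.1,
                      0.2, 0.8,
                      0.7, 0.3,
                      0.4, 0.6), ncol = 2, byrow = TRUE,
                    dimnames = list(NULL, c("Topic1", "Topic2")))
docs.toy <- c("doc A", "doc B", "doc C", "doc D")
topdocs(theta.toy, docs.toy, n = 2)  # Topic1: "doc A", "doc C"; Topic2: "doc B", "doc D"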
#' Compute symmetric version of Kullback-Leibler (KL) divergence between two categorical distributions
#'
#' @param x The vector of probabilities in the first distribution
#' @param y The vector of probabilities in the second distribution
#'
#' @export
# symmetric version of kullback-leibler divergence:
KL <- function(x, y) {
#if (length(x) != length(y)) break # must have same support
#if (sum(x) !=1 | sum(y) != 1) break # probabilities must sum to one
#if (sum(x <= 0) > 0 | sum(y <= 0) > 0) break # probabilities must be greater than zero
#if (sum(x >= 1) > 0 | sum(y >= 1) > 0) break # probabilities must be less than one
0.5*sum(x*log(x/y)) + 0.5*sum(y*log(y/x))
}
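# This is half the Jeffreys divergence (the average of the two directed KL
# divergences); it is only defined for strictly positive probability vectors.
# A toy check with two made-up distributions:
p.toy <- c(0.2, 0.5, 0.3)
q.toy <- c(0.3, 0.3, 0.4)
KL(p.toy, q.toy)  # positive, and equal to KL(q.toy, p.toy) by symmetry
KL(p.toy, p.toy)  # 0 when the two distributions coincide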
#' Create a list of required objects from fitted topic model to visualize using d3
#'
#' @param text A character vector of all training documents used to fit the LDA model
#' @param doc.id An integer vector of the document ID numbers for each token occurrence in the data
#' @param word.id An integer vector of the token ID numbers for each token occurrence in the data
#' @param topic.id An integer vector of the topic ID numbers for each token occurrence in the data
#' (from the fitted topic model)
#' @param vocab A character vector of the tokens in the vocabulary
#' @param K The number of topics in the fitted topic model
#' @param k.clusters The number of clusters into which to group the topics
#' @param lambda A number in [0,1] governing the weighted average that defines a given token's relevance
#' for a given topic. Defaults to 0.5.
#' @param n.terms The number of terms to display on the right panel of the interactive visualization for each topic
#' @param n.docs The number of example documents to display for each topic
#'
#' @export
#' @examples
#'
#' data(APinput, package="ldatools")
#' names(APinput) # [1] "word.id" "doc.id" "vocab" "category"
#' data(APtopics, package="ldatools")
#' names(APtopics) # [1] "topics" "loglik"
#' data(APcorpus)
#' length(APcorpus) # [1] 2250
#'
#' # select only the documents that were used in training the model (i.e. remove a few very short, or empty docs)
#' text <- APcorpus[APinput$category == 0]
#'
#' # Run the function and write the JSON file:
#' z <- jsviz(text=text, doc.id=APinput$doc.id, word.id=APinput$word.id, topic.id=APtopics$topics, vocab=APinput$vocab,
#' K=30, k.clusters=1, lambda=0.5, n.terms=30, n.docs=10)
#'
#' # Write the list to a JSON object and place in a directory from which to serve the d3 webpage:
#' library(RJSONIO)
#' z.out <- toJSON(z)
#' cat(z.out, file="path/lda.json")
#' # Now serve index.html from path/
jsviz <- function(text=character(), doc.id=integer(), word.id=integer(), topic.id=integer(),
vocab=character(), K=integer(), k.clusters=1, lambda=0.5, n.terms=30, n.docs=10) {
# Set some relevant local variables and run a few basic checks:
N <- length(word.id)
W <- length(vocab)
D <- length(text)
if (D != max(doc.id)) warning("Number of documents not equal to max document ID")
if (N != length(doc.id)) warning("Number of doc.id elements not equal to number of word.id elements")
if (N != length(topic.id)) warning("Number of topic.id elements not equal to number of word.id elements")
# Get estimated probability matrices theta and phi, and main.topic for each document:
dat <- getProbs(word.id, doc.id, topic.id, vocab, alpha=0.01, beta=0.01, sort.topics="byTerms", K=K)
# re-set topic.id and name it 'topics':
topics <- dat$topic.id
# compute the distinctiveness and saliency of the tokens:
tr <- token.rank(word.id, dat$topic.id, dat$phi.hat)
distinct <- tr$distinct
saliency <- tr$saliency
# set some more variables for the rest of this function:
tokens <- vocab[word.id] # the list of tokens (printed out as actual words)
docs <- doc.id
words.per.doc <- as.numeric(table(doc.id))
# Create a data.frame with the top M documents for each topic (with at least w.min words per document)
# Data.frame will have two columns, first column is labels of the form "Topick" with k=topic id number:
w.min <- 5 # minimum number of words per document for showing example documents
top.docs <- rep("", n.docs*K)
for (k in 1:K){
sel <- dat$main.topic == k & words.per.doc > w.min
o <- order(dat$theta.hat[sel, k], decreasing=TRUE)
top.docs[(k - 1)*n.docs + 1:min(sum(sel), n.docs)] <- text[sel][o][1:min(sum(sel), n.docs)]
}
### 1 ### The first main object to be returned by this function:
doc.df <- data.frame(Category=paste("Topic", rep(1:K, each=n.docs), sep=""), Document=as.character(top.docs), stringsAsFactors=FALSE)
# calculate 'distance' between topics using the most frequent tokens (those covering 80% of all token occurrences)
avg <- as.numeric(table(word.id))/length(word.id)
phi.hat <- dat$phi.hat
# set distance computation cutoff at minimum number of tokens whose cumulative probability exceeds 80%
o <- order(table(word.id), decreasing=TRUE) # order tokens by overall frequency
dist.cutoff <- min(which(cumsum(avg[o]) > 0.80))
phi <- phi.hat[, o[1:dist.cutoff]]
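# note: passing a custom distance function here appears to rely on proxy::dist();
# base stats::dist() only accepts a character method name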
d <- dist(phi, KL)
fit.cmd <- cmdscale(d, k=2)
x <- fit.cmd[,1]
y <- fit.cmd[,2]
lab <- gsub("Topic", "", names(x))
loc.df <- data.frame(x, y, topics=lab, stringsAsFactors=FALSE)
# compute % of tokens that come from each topic:
p.topics <- table(topics)/length(topics)
topics.df <- data.frame(topics=1:K, Freq=as.numeric(p.topics*100))
# join the MDS location data.frame with the Topic Frequency data.frame
mds.df <- data.frame(loc.df, Freq=topics.df[, "Freq"])
# workaround errors if no clustering is done (ie, input$kmeans == 1)
mds.df$cluster <- 1
centers <- data.frame(x=0, y=0)
if (k.clusters > 1) { # and clustering info (if relevant)
cl <- kmeans(cbind(x, y), centers=k.clusters)
mds.df$cluster <- factor(cl$cluster)
centers <- data.frame(cl$centers)
}
# map tokens to a cluster
frame <- data.frame(tokens, topics, stringsAsFactors=FALSE)
topic.cl.ID <- mds.df[c("topics", "cluster")] #why does subsetting here make the topic -> cluster assignment go haywire???
#framed <- plyr::join(frame, topic.cl.ID, by="topics")
framed <- data.frame(frame, cluster=topic.cl.ID[frame[, "topics"], "cluster"])
# I think the second way is faster.
##############################################################################
### Create a df with the info necessary to make the default OR new bar chart when selecting a topic or cluster.
### This functionality requires that we keep track of the top input$nTerms within each cluster and topic (as well as overall).
### The ranking of words under a topic is done via a weighted average of the lift and probability of the word given the topic.
### The ranking of words under a cluster is done via a similar weighted average (summing over the relevant topics)
phi.t <- t(dat$phi.hat)
weight <- lambda*log(phi.t) + (1 - lambda)*log(phi.t/avg)
#get the most relevant terms for each topic:
top.terms <- NULL
for (i in 1:K) {
weights <- weight[, i]
o <- order(weights, decreasing=TRUE)
terms <- rownames(phi.t)[o][1:n.terms]
top.terms <- c(top.terms, terms)
}
term.labs <- rep(paste0("Topic", 1:K), each=n.terms)
topic.df <- data.frame(Term=top.terms, Category=term.labs, stringsAsFactors=FALSE)
# get the top terms for each cluster:
t.weights <- as.numeric(table(topics))/N
clust.terms <- NULL
if (k.clusters == 1) {
# if no clustering is done, we don't want to change the 'most informative words' upon hover
clust.terms <- rownames(phi.t)[1:n.terms]
} else {
for (i in 1:k.clusters) {
# grab topics that belong to the current cluster
topicz <- which(topic.cl.ID[, "cluster"] == i)
sub.phi <- phi.t[, topicz]
sub.theta <- dat$theta.hat[, topicz]
#only sum if multiple columns exist
if (!is.null(dim(sub.phi))) {
sub.phi <- apply(t(sub.phi)*t.weights[topicz], 2, sum) # weighted by topic term frequency
sub.theta <- apply(t(sub.theta)*t.weights[topicz], 2, sum) # weighted by topic term frequency
}
weight <- lambda*log(sub.phi) + (1 - lambda)*log(sub.phi/avg)
o <- order(weight, decreasing=TRUE)
terms <- rownames(phi.t)[o][1:n.terms]
clust.terms <- c(clust.terms, terms)
}
}
term.labs <- rep(paste0("Cluster", 1:k.clusters), each=n.terms)
clust.df <- data.frame(Term=clust.terms, Category=term.labs, stringsAsFactors=FALSE)
# Order the terms for the "default" view by decreasing saliency:
top.df <- data.frame(Term=vocab[order(saliency, decreasing=TRUE)][1:n.terms], Category="Default")
all.df <- rbind(topic.df, clust.df, top.df)
all.df$Freq <- 0
# now we have all the top ranking words within each cluster and topic
# next, we find the frequency of those words in each category
all.words <- unique(all.df$Term)
all.frame <- subset(framed, tokens %in% all.words)
counts <- table(as.character(all.frame$tokens), all.frame$topics)
counts2 <- table(as.character(all.frame$tokens), all.frame$cluster)
for (i in 1:K) {
idx <- which(all.df$Category == paste0("Topic", i))
all.df$Freq[idx] <- counts[all.df$Term[idx], i]
}
for (i in 1:k.clusters) {
idx <- which(all.df$Category == paste0("Cluster", i))
all.df$Freq[idx] <- counts2[all.df$Term[idx], i]
}
totals <- table(as.character(all.frame$tokens))
idx <- which(all.df$Category == "Default")
all.df$Freq[idx] <- totals[all.df$Term[idx]]
all.df$Total <- as.integer(totals[all.df$Term])
# relative frequency (in percentages) over topics for each possible term
probs <- t(apply(counts, 1, function(x) round(100*x/sum(x)))) # round() gets closer to 100, although sometimes over
topic.probs <- data.frame(probs, stringsAsFactors=FALSE)
topic.probs$Term <- rownames(probs)
topic.table <- data.frame(Term = rep(rownames(probs), K), Topic=rep(1:K, each=length(all.words)),
Freq = as.numeric(as.matrix(topic.probs[, 1:K])))
return(list(mdsDat=mds.df, mdsDat2=topic.table, barDat=all.df, docDat=doc.df,
centers=centers, nClust=k.clusters))
}
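# A stripped-down illustration of the topic-scaling step inside jsviz(): pairwise
# KL-based distances between (toy, made-up) topic-word distributions are embedded
# in two dimensions with classical MDS. The explicit double loop avoids relying on
# a dist() method that accepts a custom distance function.
phi.toy <- matrix(c(0.6, 0.3, 0.1,
                    0.1, 0.3, 0.6,
                    0.3, 0.4, 0.3), nrow = 3, byrow = TRUE)
K.toy <- nrow(phi.toy)
d.toy <- matrix(0, K.toy, K.toy)
for (i in 1:K.toy) for (j in 1:K.toy) d.toy[i, j] <- KL(phi.toy[i, ], phi.toy[j, ])
xy.toy <- cmdscale(as.dist(d.toy), k = 2)  # one (x, y) location per topic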