# Implementation of Latent Dirichlet Allocation using collapsed Gibbs sampling
LDA_gibbs <- function(iter, rawdocs, K, alpha, beta) {
  # iter:    number of Gibbs sampling iterations to run
  # rawdocs: character vector; each element is one document of space-separated words
  # K:       number of topics (must be pre-specified)
  # alpha:   hyperparameter of the symmetric Dirichlet prior on document-topic proportions
  # beta:    hyperparameter of the symmetric Dirichlet prior on topic-word distributions
  #
  # Returns a list with:
  #   wt: K x V word-topic count matrix
  #   dt: K x D document-topic count matrix
  #   ta: list of per-document topic assignment vectors
  docs <- strsplit(rawdocs, split = " ", perl = TRUE)   # tokenize each document
  vocab <- unique(unlist(docs))                         # unique words define the vocabulary
  D <- length(docs)                                     # number of documents
  V <- length(vocab)                                    # vocabulary size
  # replace each word by its integer wordID (index into vocab)
  docs <- lapply(docs, match, table = vocab)
  # topic-assignment container; lapply (not sapply) so that equal-length
  # documents cannot silently collapse the result into a matrix
  ta <- lapply(docs, function(x) integer(length(x)))
  wt <- matrix(0, K, V)  # word-topic count matrix (K x V)
  dt <- matrix(0, K, D)  # document-topic count matrix (K x D)

  # randomly assign an initial topic to each word token and accumulate wt
  for (d in seq_len(D)) {
    for (w in seq_along(docs[[d]])) {
      ti <- sample(seq_len(K), 1)        # random initial topic for token w
      ta[[d]][w] <- ti
      wi <- docs[[d]][w]                 # wordID of token w
      wt[ti, wi] <- wt[ti, wi] + 1
    }
  }
  # build the document-topic count matrix from the initial assignments
  for (d in seq_len(D)) {
    for (t in seq_len(K)) {
      dt[t, d] <- sum(ta[[d]] == t)
    }
  }

  # collapsed Gibbs sampling sweeps
  for (i in seq_len(iter)) {
    for (d in seq_len(D)) {
      for (w in seq_along(docs[[d]])) {
        ta0 <- ta[[d]][w]                # current topic of token w
        wid <- docs[[d]][w]              # wordID of token w
        # exclude token w's own assignment from the counts before sampling
        dt[ta0, d] <- dt[ta0, d] - 1
        wt[ta0, wid] <- wt[ta0, wid] - 1
        # full conditional p(z_i = j | z_-i, w_i, d_i, ...)
        denom_a <- rowSums(wt) + V * beta      # words assigned to each topic
        denom_b <- sum(dt[, d]) + K * alpha    # words in document d
        p_z <- (wt[, wid] + beta) / denom_a * (dt[, d] + alpha) / denom_b
        # draw a new topic from the (normalized) conditional distribution
        t1 <- sample(seq_len(K), 1, prob = p_z / sum(p_z))
        ta[[d]][w] <- t1
        # restore token w into the count matrices under its new topic
        dt[t1, d] <- dt[t1, d] + 1
        wt[t1, wid] <- wt[t1, wid] + 1
        # report reassignments; original printed 'iter' (the total) instead of
        # the current iteration 'i'
        if (ta0 != t1) {
          print(paste0("iter:", i, " doc:", d, " word:", w,
                       " topic:", ta0, " =>", t1))
        }
      }
    }
  }
  # original discarded all results (implicit NULL return); expose them
  list(wt = wt, dt = dt, ta = ta)
}
# NOTE: removed trailing webpage boilerplate ("Add the following code to your
# website..." / "For more information on customizing the embed code...") that was
# pasted in from the source page; it was not valid R and broke parsing.