R/LDA_gibbs.r

Defines functions LDA_gibbs

# implementation of Latent Dirichlet Allocation using Gibbs sampling
LDA_gibbs <- function(iter, rawdocs, K, alpha, beta){
  # iter: number of Gibbs sampling iterations to run
  # rawdocs: a character vector with one space-delimited document per element (the whole corpus)
  # K: the number of topics, which must be pre-specified
  # alpha, beta: hyperparameters of the two symmetric Dirichlet priors
  
  docs <- strsplit(rawdocs, split=' ', perl=TRUE) # split each document into a vector of words, giving a list
  vocab <- unique(unlist(docs)) # extract all unique words; their positions in vocab serve as wordIDs
  D <- length(docs) # the total number of documents
  V <- length(vocab) # the total number of unique words
  for(i in 1:D) docs[[i]] <- match(docs[[i]], vocab) # replace words with their wordIDs
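  # e.g. a document c('eat','turkey','eat') with vocab c('eat','turkey') becomes c(1, 2, 1)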
  
  ta <- lapply(docs, function(x) rep(0, length(x))) # initialize topic assignment list (lapply, so we keep a list even when all documents have the same length)
  wt <- matrix(0, K, V) # initialize word-topic count matrix (K x V)
  dt <- matrix(0, K, D) # initialize document-topic count matrix (K x D)
  
  # randomly assign topics to each word in the corpus
  for(d in 1:D){ # for each document 
    for(w in 1:length(docs[[d]])){ # for each word in document d
      
      ta[[d]][w] <- sample(1:K, 1) # randomly assign a topic to word w
      ti <- ta[[d]][w] # topic index
      wi <- docs[[d]][w] # wordID for word w
      wt[ti,wi] <- wt[ti,wi]+1 # update word-topic count matrix     
      
    }
  }
  
  # generate document-topic count matrix
  for(d in 1:D){ # for each document d
    for(t in 1:K){ # for each topic t
      dt[t,d] <- sum(ta[[d]]==t) # count words in document d assigned to topic t   
    }
  }
  
  # Gibbs sampling: iteratively resample the topic assignment of every word
  for (i in 1:iter){ # for each iteration
    for(d in 1:D){ # for each document
      for(w in 1:length(docs[[d]])){ # for each word
        
        ta0 <- ta[[d]][w] # current topic assignment of word w
        wid <- docs[[d]][w] # wordID of word w
        
        dt[ta0,d] <- dt[ta0,d]-1 # don't include word w in document-topic count matrix when sampling for word w
        wt[ta0,wid] <- wt[ta0,wid]-1 # don't include word w in word-topic count matrix when sampling for word w
        
        # full conditional p(z_i = j | z_-i, w_i, d_i, alpha, beta), evaluated for every topic j at once
        denom_a <- rowSums(wt) + V*beta # number of words assigned to each topic, plus smoothing
        denom_b <- sum(dt[,d]) + K*alpha # number of words in document d, plus smoothing
        p_z <- (wt[,wid] + beta)/denom_a * (dt[,d] + alpha)/denom_b 
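        # written out, this is the standard collapsed Gibbs full conditional for LDA
        # (Griffiths & Steyvers, 2004):
        #   p(z_i = j | z_-i, w) is proportional to
        #   (wt[j,wid] + beta) / (sum(wt[j,]) + V*beta) * (dt[j,d] + alpha) / (sum(dt[,d]) + K*alpha)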
        
        # draw a topic for word w from multinomial using probabilities calculated above
        t1 <- sample(1:K, 1, prob=p_z/sum(p_z)) 
        
        # update topic assignment for word w
        ta[[d]][w] <- t1 
        
        # update WT, DT matrices with the newly sampled topic
        dt[t1,d] <- dt[t1,d]+1 # increment document-topic count for topic t1
        wt[t1,wid] <- wt[t1,wid]+1 # increment word-topic count for topic t1
        
        # report when a topic assignment changes
        if(ta0!=t1) print(paste0('iter:', i, ' doc:', d, ' word:', w, ' topic:', ta0, ' => ', t1)) 
      } 
    }
  }
  
  # return the final topic assignments and the two count matrices
  list(ta=ta, wt=wt, dt=dt)
}
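
A minimal usage sketch on an invented toy corpus; the documents, K, and hyperparameter values below are illustrative assumptions, not part of the package:

# toy corpus: three short documents as space-delimited strings
rawdocs <- c('eat turkey on turkey day holiday',
             'i like to eat cake on holiday',
             'turkey trot race on thanksgiving holiday')

set.seed(1)
alpha <- 0.1; beta <- 0.001
res <- LDA_gibbs(iter=100, rawdocs=rawdocs, K=2, alpha=alpha, beta=beta)

# smoothed point estimates of the topic-word (phi, K x V) and
# document-topic (theta, D x K) distributions; each row sums to 1
phi <- res$wt + beta; phi <- phi / rowSums(phi)
theta <- t(res$dt) + alpha; theta <- theta / rowSums(theta)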