#' Movie Group Process Function
#'
#' This function implements the Movie Group Process outlined by Ying and Wang in their 2014 paper
#' (A Dirichelt Multinomial Mixture Model-based Approach for Short Text Clustering).
#'
#' @param data A data frame.
#' @param text The name of a column within the data frame containing text to cluster. The column name should
#' not be listed in quotes.
#' @param K The upper limit for the number of topics. The function will automatically condense and remove
#' empty clusters.
#' @param alpha A tuning parameter ranging from 0 to 1 controlling a documents affinity for a larger cluster.
#' Default value is set to 0.1.
#' @param beta A tuning parameter ranging from 0 to 1 controlling a documents affinity for a more similar
#' cluster. Default values is set to 0.1.
#' @param iter The upper limit for the number of iterations to perform. The function will terminate
#' earlier if a stable solution is found. Default is set at 30.
#' @param repeat_words A logical vector indicated whether the documents contain repeated words. If TRUE, the
#' function uses a Algorithm 4 from Yin and Wang's paper; if FALSE, the function using Algorithm 3 from their
#' paper. Default is set to FALSE.
#' @param r_stopwords A logical vector indicating whether stop words should be removed. Default is set at TRUE.
#'
#' @return The function returns the original data frame with an additional column containing cluster assigments
#' for each row of text.
#'
#' @examples
#'
#'
#'
MovieGroupProcess <- function(data, text, K, alpha = 0.1, beta = 0.1, iter = 30, repeat_words = FALSE, r_stopwords = TRUE) {
## Tokenizing data and removing null (i.e. "") values
text <- data[[rlang::quo_name(rlang::enquo(text))]]
text <- strsplit(x = tolower(text), split = "\\W")
text <- rapply(text, function(x) x[x != ""], how = "replace")
## Removing stop words
if (r_stopwords == TRUE) {
text <- lapply(text, setdiff, tidytext::stop_words$word)
}
## creating documents object
documents <- list()
for (i in 1:length(text)) {
documents$document[i] <- i
if (repeat_words == FALSE) {
documents$tokens[i] <- list(unique(text[[i]]))
} else {
documents$tokens[i] <- list(text[[i]])
}
documents$topic[i] <- sample(x = K, size = 1)
}
## creating topics object
topics <- list()
for (i in 1:K) {
topics$documents[i] <- list(documents$document[documents$topic == i])
topics$tokens[i] <- list(unlist(documents$tokens[documents$topic == i]))
}
## GSDMM Alogrithm
for (i in 1:iter) {
old_topics <- documents$topic
for (d in 1:length(documents$document)) {
## Removing document from its cluster and reforming clusters
documents$topic[[d]] <- 0
topics <- list()
for (j in 1:K) {
topics$documents[j] <- list(documents$document[documents$topic == j])
topics$tokens[j] <- list(unlist(documents$tokens[documents$topic == j]))
} ## end of j loop
p <- list()
for (t in 1:K) {
N_1 <- length(topics$documents[[t]]) + alpha
D_1 <- length(documents$document) - 1 + K*alpha
N_2 <- 1
D_2 <- 1
for (w in documents$tokens[[d]]) {
if (repeat_words == FALSE) {
N_2 <- N_2*(sum(w == topics$tokens[[t]]) + beta)
} else {
for (n_w_d in 1:sum(w == documents$tokens[[d]])) {
N_2 <- N_2*(sum(w == topics$tokens[[t]]) + beta)
}
}
}
for (n_d in 1:length(documents$tokens[[d]])) {
D_2 <- D_2*(length(topics$tokens[[t]]) + length(unique(unlist(documents$tokens)))*beta + n_d - 1)
}
p$prob[[t]] <- (N_1/D_1)*(N_2/D_2)
}
## Assigning document to new cluster/topic and reforming clusters/topics
documents$topic[[d]] <- which(unlist(p$prob) == max(unlist(p$prob)))
topics <- list()
for (j in 1:K) {
topics$documents[j] <- list(documents$document[documents$topic == j])
topics$tokens[j] <- list(unlist(documents$tokens[documents$topic == j]))
}
}
if (sum(documents$topic != old_topics) == 0) {
print(paste0("MGP converged after ", i, " iterations."))
break
}
print(paste0("Iteration ", i, " is complete."))
}
## Removing emptpy topics
topics$documents <- topics$documents[which(lengths(topics$tokens) != 0)]
topics$tokens <- topics$tokens[which(lengths(topics$tokens) != 0)]
for (j in 1:length(topics$documents)) {
documents$topic[topics$documents[[j]]] <- j
}
## Adding cluster to orinal data.frame
data$text_cluster <- documents$topic
data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.