#' Calculate IMI
#'
#' @description
#' Function to calculate instantanueous mutual information for types for a given topic.
#' See reference for details.
#'
#' @references
#' Mimno, D. and Blei, D. Bayesian Checking for Topic Models
#'
#' @param state a tidy_topic_state object
#' @param g Grouping variable to use to calculate IMI. Default is 'doc'.
#' @param w A \code{\link[tibble]{tibble}} with types and topics to calculate IMI for. Default is NULL, all types.
#'
#' @export
imi <- function(state, g = "doc", w=NULL){
checkmate::assert(is.tidy_topic_state(state))
checkmate::assert_choice(g, names(state))
checkmate::assert_choice(class(state[[g]]), c("integer", "factor", "character"))
checkmate::assert(checkmate::check_class(w, "tbl_df"),
checkmate::check_character(w, null.ok = TRUE))
checkmate::assert(!is.grouped_df(w))
# Cleanup state file
eval(parse(text=paste0("state <- dplyr::transmute(state, doc = as.integer(",g ,"), type, topic)")))
state <- dplyr::transmute(state, topic, doc, type)
# Calculate H(D|k)
HDk <- dplyr::summarise(dplyr::group_by(state, doc, topic), n = n())
HDk <- dplyr::group_by(HDk, topic)
HDk <- dplyr::mutate(HDk, sum_n = sum(n), pmi = log(n/sum_n) * (n/sum_n), sum_n = NULL)
HDk <- dplyr::summarise(HDk, HDk = sum(pmi))
HDk <- dplyr::mutate(HDk, HDk = -1 * HDk)
# Calculate H(D|W=w, k)
imis <- dplyr::summarise(dplyr::group_by(state, doc, type, topic), n = n())
imis <- dplyr::group_by(imis, type, topic)
imis <- dplyr::mutate(imis, sum_n = sum(n), pmi = log(n/sum_n) * (n/sum_n), sum_n = NULL)
imis <- dplyr::summarise(imis, imi = sum(pmi))
imis <- dplyr::left_join(imis, HDk, by = "topic")
imis <- dplyr::mutate(imis, imi = imi + HDk, HDk = NULL)
imis <- dplyr::arrange(imis, topic, desc(imi))
imis <- dplyr::ungroup(imis)
if(!is.null(w)) {
imis <- dplyr::right_join(imis, w, by = c("topic", "type"))
}
imis
}
#' Calculate Mutual information between types and documents (or groups)
#'
#' @description
#' Function to calculate mutual information between types and documents (MI(D,W|k))
#' for a given topic.
#'
#' \code{mi_group()} calculates MI based on a grouping variable g, used for model checking.
#'
#' See reference for details.
#'
#' @references
#' Mimno, D. and Blei, D. Bayesian Checking for Topic Models
#'
#' @param state A topic model state file
#' @param g A character element as a name of a grouping variable in \code{state} (factor or integer). Default is \code{doc}.
#'
#' @export
mi <- function(state, g = "doc"){
checkmate::assert(is.tidy_topic_state(state))
checkmate::assert_choice(g, names(state))
checkmate::assert_choice(class(state[[g]]), c("integer", "factor", "character"))
eval(parse(text=paste0("state <- dplyr::transmute(state, doc = as.integer(", g ,"), type, topic)")))
state <- dplyr::transmute(state, topic, doc, type)
state <- dplyr::group_by(state, doc, type, topic)
state <- dplyr::summarise(state, n = n())
state <- dplyr::ungroup(state)
Ndk <- dplyr::summarise(dplyr::group_by(state, doc, topic), nd = n())
Nwk <- dplyr::summarise(dplyr::group_by(state, type, topic), nw = n())
Nk <- dplyr::summarise(dplyr::group_by(Nwk, topic), nk = sum(nw))
Nwk <- dplyr::left_join(Nwk, Nk, by = c("topic"))
state <- dplyr::group_by(state, topic)
state <- dplyr::inner_join(state, Ndk, by = c("topic", "doc"))
state <- dplyr::inner_join(state, Nwk, by = c("topic", "type"))
state <- dplyr::mutate(state, part_mi = n/nk * log((n * nk)/(nd * nw)))
state <- dplyr::summarise(state, mi = sum(part_mi))
state
}
#' Calculate MI deviance
#'
#' @details
#' When sampling and deviation calculation this can be done using argument state and
#' and iteration number.
#'
#' @param observed a \code{tbl_df} with \code{mi} by \code{topic} based on the observed data.
#' @param replicated a \code{tbl_df} with \code{mi} by \code{topic} based on multiple replications.
#'
#' @export
mi_deviance <- function(observed, replicated){
checkmate::assert_class(observed, "tbl_df")
checkmate::assert_class(replicated, "tbl_df")
checkmate::assert_subset(names(observed), c("topic","mi"))
checkmate::assert_subset(names(replicated), c("topic","mi"))
checkmate::assert_set_equal(unique(observed$topic), unique(replicated$topic))
checkmate::assert(nrow(observed) < nrow(replicated))
dev <- dplyr::group_by(replicated, topic)
dev <- dplyr::summarise(dev, mi_mean = mean(mi), std = stats::sd(mi))
dev <- dplyr::full_join(dev, observed, by = "topic")
dev <- dplyr::transmute(dev,deviance = (mi - mi_mean) / std)
dev
}
#' Create IMI plot
#'
#' @details
#' When sampling and deviation calculation this can be done using argument state and
#' and iteration number.
#'
#' @param observed_imi a \code{tbl_df} with \code{imi}, \code{type} and \code{topic} based on the original data.
#' @param replicated_imi a \code{tbl_df} with \code{imi}, \code{type} and \code{topic} based on multiple replications. Default is NULL, no replications used.
#' @param topic which topic to plot
#'
#' @export
ggplot_imi_type <- function(observed_imi, topic, replicated_imi = NULL){
checkmate::assert_class(observed_imi, "tbl_df")
checkmate::assert_subset(c("topic","type","imi"), names(observed_imi))
checkmate::assert_choice(topic, unique(observed_imi$topic))
if(!is.null(replicated_imi)){
checkmate::assert_class(replicated_imi, "tbl_df")
checkmate::assert_subset(c("topic","type","imi"), names(replicated_imi))
checkmate::assert_set_equal(unique(observed_imi$topic), unique(replicated_imi$topic))
checkmate::assert(nrow(observed_imi) <= nrow(replicated_imi))
}
k <- topic
observed_k <- filter(observed_imi, topic == k)
observed_k$type <- factor(observed_k$type,
levels = rev(as.character(observed_k$type)))
if(!is.null(replicated_imi)){
replicated_k <- filter(replicated_imi, topic == k)
replicated_k$type <- factor(replicated_k$type,
levels = rev(as.character(observed_k$type)))
}
p <- ggplot(data = observed_k, aes(x = type, y = imi))
if(!is.null(replicated_imi)){
p <- p + geom_boxplot(data = replicated_k, aes(x = type, y = imi), color = "gray", outlier.shape = NA) + coord_flip()
}
p <- p + geom_point(shape = 2L) + coord_flip()
p + coord_flip()
}
#' Sample types for a given setup of topic indicators
#'
#' @description
#' The function is sampling new word types based on posterior mode without hyper parameters.
#'
#' This function is used for modeling comparisons using Bayesian checking.
#' See reference for details.
#'
#' @references
#' Mimno, D. and Blei, D. Bayesian Checking for Topic Models
#'
#' @param state a topic model state object
#' @param n The number of random draws of tokens given topic.
#'
#' @return
#' A tidy dataset with simulated tokens.
#'
#' @examples
#' # Load the state of the union topic model
#' data("sotu50")
#' w <- sample_tokens_given_topics(state = sotu50, n = 5)
#'
#' @export
sample_tokens_given_topics <- function(state, n = 1){
checkmate::assert(is.tidy_topic_state(state))
checkmate::assert_number(n, lower = 1)
Nkw <- topic_type_matrix(state)
# Calculate the number of tokens by topic
Nk <- dplyr::group_by(state, topic)
Nk <- dplyr::summarise(Nk, n = n())
Nk <- dplyr::ungroup(Nk)
# Calculate probability and split
pdfs <- split(dplyr::transmute(Nkw, type = as.integer(type), n), Nkw$topic)
w <- dplyr::bind_rows(lapply(1L:n, function(x) {
state$sample <- x
state
}))
w <- dplyr::mutate(w, type = as.integer(type))
for(i in 1:nrow(Nk)){
w[["type"]][state$topic == i] <-
sample(pdfs[[i]]$type, size = Nk$n[i] * n, replace = TRUE, prob = pdfs[[i]]$n)
}
w[["type"]] <- factor(levels(state$type)[w[["type"]]], levels = levels(state$type))
w
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.