#' A function that generates plots similar to those in Monroe et al.
#' 'Fightin Words...'.
#'
#' @param feature_selection_object A list object generated by the
#' feature_selection function.
#' @param title A user supplied title for the plot. Defaults to "", in which
#' case a blank title is displayed.
#' @param positive_category The name the user wishes to give to the first
#' category specified when using the feature_selection function. Defaults to
#' "Category 1".
#' @param negative_category The name the user wishes to give to the second
#' category specified when using the feature_selection function. Defaults to
#' "Category 2".
#' @param xlab Defaults to 'Term Frequency', but can be modified as necessary.
#' @param display_top_words Defaults to 20 and controls the number of top terms
#' for each category displayed in the plot.
#' @param display_terms_next_to_points Optional argument, defaults to FALSE. If
#' TRUE, then terms are displayed next to the points corresponding to them on
#' the plot. Can get messy.
#' @param size_terms_by_frequency Optional argument, defualts to FALSE. If TRUE,
#' then when top terms are printed, they are sized in proportion to their
#' frequency.
#' @param right_margin Parameter controling how much space should be reserved
#' for the right margin in the plot (for displaying top terms). Defaults to 20
#' but can be adjusted depending on the length of terms.
#' @param max_terms_to_display Defaults to 100,000. Used to prevent overloading
#' the plotting device with very large vocabularies. Can be set by the user.
#' @param use_subsumed_ngrams Logical indicating whether subsumed ngrams should
#' be used when displaying top terms. This will only work if the user has
#' selected subsume_ngrams = TRUE in the feature_selection() function (and is
#' using a vocabulary contianing overlapping n-grams).
#' @param limits An optional numeric vector of length two where the first number
#' is the upper x limit (term count) and the second term is the absolute value
#' of the maximum z-score to display (the y limit). Defaults to NULL, in which
#' case the optimal values are automatically determined. Can be useful for
#' comparison between plots.
#' @param clean_publication_plots Logical to remove labels inside of plot and
#' color all dots uniformly. Defaults to FALSE.
#' @param rank_by_log_odds Only applicable for the "informed_Dirichlet" method.
#' Defaults to FALSE. If TRUE, then terms are ranked by log odds instead of z-score.
#' @return A Fightin' Words plot
#' @export
fightin_words_plot <- function(feature_selection_object,
title = "",
positive_category = "Category 1",
negative_category = "Category 2",
xlab = "term count",
display_top_words = 20,
display_terms_next_to_points = FALSE,
size_terms_by_frequency = FALSE,
right_margin = 20,
max_terms_to_display = 100000,
use_subsumed_ngrams = FALSE,
limits = NULL,
clean_publication_plots = FALSE,
rank_by_log_odds = FALSE) {
options(scipen = 999)
par(mar = c(5.1, 4.1, 4.1, right_margin))
UMASS_BLUE <- rgb(51, 51, 153, 255, maxColorValue = 255)
UMASS_RED <- rgb(153, 0, 51, 255, maxColorValue = 255)
if (class(feature_selection_object) == "list") {
z_scores <- feature_selection_object[[3]]$z_scores
zeta <- feature_selection_object[[3]]$scores
y.tot <- feature_selection_object[[3]]$total_count
words <- feature_selection_object[[3]]$terms
if (!is.null(feature_selection_object$rank_by_log_odds)) {
rank_by_log_odds <- feature_selection_object$rank_by_log_odds
}
} else if (class(feature_selection_object) == "data.frame") {
z_scores <- feature_selection_object$z_scores
zeta <- feature_selection_object$scores
y.tot <- feature_selection_object$total_count
words <- feature_selection_object$terms
} else {
stop("You must provide an object generated by the feature_selection function...")
}
if (use_subsumed_ngrams) {
top_words_cat1 <- feature_selection_object$Subsumed_NGrams[[1]]$ranked_term_clusters[,1]
top_words_cat2 <- feature_selection_object$Subsumed_NGrams[[2]]$ranked_term_clusters[,1]
words[1:length(top_words_cat1)] <- top_words_cat1
words[(length(words) - length(top_words_cat2) + 1):length(words)] <- rev(top_words_cat2)
} else if (rank_by_log_odds) {
# get teh top ranked terms with a z-score over 1.96 and use them
z1 <- which(z_scores > 1.96)
z2 <- which(z_scores < -1.96)
i1 <- order(zeta[z1],decreasing = TRUE)
i2 <- order(abs(zeta[z2]),decreasing = TRUE)
top_words_cat1 <- words[z1[i1]]
top_words_cat2 <- words[z2[i2]]
words[1:length(top_words_cat1)] <- top_words_cat1
words[(length(words) - length(top_words_cat2) + 1):length(words)] <-top_words_cat2
}
if (length(zeta) > max_terms_to_display) {
tot <- length(zeta)
bound <- floor(max_terms_to_display/2)
z_scores <- c(z_scores[1:bound], z_scores[(tot-bound+1):tot])
zeta <- c(zeta[1:bound], zeta[(tot-bound+1):tot])
y.tot <- c(y.tot[1:bound], y.tot[(tot-bound+1):tot])
words <- c(words[1:bound], words[(tot-bound+1):tot])
}
max_y.tot <- max(y.tot)
if (clean_publication_plots) {
max.zeta.one <- which(z_scores > 1.96)
max.zeta.two <- which(z_scores < -1.96)
} else {
max.zeta.one <- which(z_scores > 1.96)[1:display_top_words]
max.zeta.two <- which(z_scores < -1.96)
max.zeta.two <- max.zeta.two[(length(max.zeta.two)-display_top_words+1):length(max.zeta.two)]
max.zeta.two <- rev(max.zeta.two)
}
# make sure that we put our top terms in the right places
if (use_subsumed_ngrams) {
words[max.zeta.one] <- top_words_cat1[1:display_top_words]
words[max.zeta.two] <- top_words_cat2[1:display_top_words]
}
if (rank_by_log_odds) {
words[max.zeta.one] <- top_words_cat1[1:display_top_words]
words[max.zeta.two] <- top_words_cat2[1:display_top_words]
}
# determine if the user has specified limits and if so sets them manually
if (!is.null(limits)) {
display_limits <- 1.2 * abs(limits[2])
ylims <- c(-display_limits, display_limits)
xlims <- c(1, abs(limits[1]))
} else {
display_limits <- 1.2 * max(abs(zeta))
ylims <- c(-display_limits, display_limits)
xlims <- c(1, 2 * max_y.tot)
}
sig.z <- abs(z_scores) > 1.96
psize <- 2 * abs(zeta)/max(abs(zeta))
if (rank_by_log_odds) {
plot(xlims, ylims,
type = "n", log = "x", pch = 19, col = "black", cex = psize,
main = title, ylab = "log-odds ratio", xlab = xlab)
} else {
plot(xlims, ylims,
type = "n", log = "x", pch = 19, col = "black", cex = psize,
main = title, ylab = expression(italic(z)-score), xlab = xlab)
}
points(y.tot, zeta, pch = 19, col = "gray", cex = psize)
points(y.tot[sig.z], zeta[sig.z], pch = 19, col = "black",
cex = psize[sig.z])
points(y.tot[max.zeta.one], zeta[max.zeta.one], pch = 19,
col = UMASS_BLUE, cex = psize[max.zeta.one])
points(y.tot[max.zeta.two], zeta[max.zeta.two], pch = 19,
col = UMASS_RED, cex = psize[max.zeta.two])
if (!clean_publication_plots) {
if (size_terms_by_frequency) {
mtext(text = words[max.zeta.one], side = 4, col = UMASS_BLUE,
las = 1, line = 1, at = seq(0.95 * display_limits,
0.05 * display_limits, length.out = display_top_words),
cex = psize[max.zeta.one])
mtext(text = words[max.zeta.two], side = 4, col = UMASS_RED,
las = 1, line = 1, at = seq(-0.95 * display_limits,
-0.05 * display_limits, length.out = display_top_words),
cex = psize[max.zeta.two])
} else {
mtext(text = words[max.zeta.one], side = 4, col = UMASS_BLUE,
las = 1, line = 1, at = seq(0.95 * display_limits,
0.05 * display_limits, length.out = display_top_words))
mtext(text = words[max.zeta.two], side = 4, col = UMASS_RED,
las = 1, line = 1, at = seq(-0.95 * display_limits,
-0.05 * display_limits, length.out = display_top_words))
}
if (display_terms_next_to_points) {
text(y.tot[max.zeta.one], zeta[max.zeta.one], words[max.zeta.one],
pch = 19, col = UMASS_BLUE, pos = 4, cex = psize[max.zeta.one])
text(y.tot[max.zeta.two], zeta[max.zeta.two], words[max.zeta.two],
pch = 19, col = UMASS_RED, pos = 4, cex = psize[max.zeta.two])
}
text(1, 0.9 * display_limits, positive_category, col = UMASS_BLUE,
pos = 4, cex = 2)
text(1, -0.9 * display_limits, negative_category, col = UMASS_RED,
pos = 4, cex = 2)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.