SpeedReader: High Performance Text Analysis

Documented in fightin_words_plot

#' A function that generates plots similar to those in Monroe et al.
#' 'Fightin Words...'.
#'
#' @param feature_selection_object A list object generated by the
#' feature_selection function.
#' @param title A user supplied title for the plot. Defaults to "", in which
#' case a blank title is displayed.
#' @param positive_category The name the user wishes to give to the first
#' category specified when using the feature_selection function. Defaults to
#' "Category 1".
#' @param negative_category The name the user wishes to give to the second
#' category specified when using the feature_selection function. Defaults to
#' "Category 2".
#' @param xlab Defaults to 'Term Frequency', but can be modified as necessary.
#' @param display_top_words Defaults to 20 and controls the number of top terms
#' for each category displayed in the plot.
#' @param display_terms_next_to_points Optional argument, defaults to FALSE. If
#' TRUE, then terms are displayed next to the points corresponding to them on
#' the plot. Can get messy.
#' @param size_terms_by_frequency Optional argument, defualts to FALSE. If TRUE,
#' then when top terms are printed, they are sized in proportion to their
#' frequency.
#' @param right_margin Parameter controling how much space should be reserved
#' for the right margin in the plot (for displaying top terms). Defaults to 20
#' but can be adjusted depending on the length of terms.
#' @param max_terms_to_display Defaults to 100,000. Used to prevent overloading
#' the plotting device with very large vocabularies. Can be set by the user.
#' @param use_subsumed_ngrams Logical indicating whether subsumed ngrams should
#' be used when displaying top terms. This will only work if the user has
#' selected subsume_ngrams  = TRUE in the feature_selection() function (and is
#' using a vocabulary contianing overlapping n-grams).
#' @param limits An optional numeric vector of length two where the first number
#' is the upper x limit (term count) and the second term is the absolute value
#' of the maximum z-score to display (the y limit). Defaults to NULL, in which
#' case the optimal values are automatically determined. Can be useful for
#' comparison between plots.
#' @param clean_publication_plots Logical to remove labels inside of plot and
#' color all dots uniformly. Defaults to FALSE.
#' @param rank_by_log_odds Only applicable for the "informed_Dirichlet" method.
#' Defaults to FALSE. If TRUE, then terms are ranked by log odds instead of z-score.
#' @return A Fightin' Words plot
#' @export
fightin_words_plot <- function(feature_selection_object,
                               title = "",
                               positive_category = "Category 1",
                               negative_category = "Category 2",
                               xlab = "term count",
                               display_top_words = 20,
                               display_terms_next_to_points = FALSE,
                               size_terms_by_frequency = FALSE,
                               right_margin = 20,
                               max_terms_to_display = 100000,
                               use_subsumed_ngrams = FALSE,
                               limits = NULL,
                               clean_publication_plots = FALSE,
                               rank_by_log_odds = FALSE) {
  options(scipen = 999)
  par(mar = c(5.1, 4.1, 4.1, right_margin))
  UMASS_BLUE <- rgb(51, 51, 153, 255, maxColorValue = 255)
  UMASS_RED <- rgb(153, 0, 51, 255, maxColorValue = 255)

  if (class(feature_selection_object) == "list") {
    z_scores <- feature_selection_object[[3]]$z_scores
    zeta <- feature_selection_object[[3]]$scores
    y.tot <- feature_selection_object[[3]]$total_count
    words <- feature_selection_object[[3]]$terms
    if (!is.null(feature_selection_object$rank_by_log_odds)) {
        rank_by_log_odds <- feature_selection_object$rank_by_log_odds
    }
  } else if (class(feature_selection_object) == "data.frame") {
     z_scores <- feature_selection_object$z_scores
    zeta <- feature_selection_object$scores
    y.tot <- feature_selection_object$total_count
    words <- feature_selection_object$terms
  } else {
    stop("You must provide an object generated by the feature_selection function...")
  }

  if (use_subsumed_ngrams) {
      top_words_cat1 <- feature_selection_object$Subsumed_NGrams[[1]]$ranked_term_clusters[,1]
      top_words_cat2 <- feature_selection_object$Subsumed_NGrams[[2]]$ranked_term_clusters[,1]
      words[1:length(top_words_cat1)] <- top_words_cat1
      words[(length(words) - length(top_words_cat2) + 1):length(words)] <- rev(top_words_cat2)
  } else if (rank_by_log_odds) {
      # get teh top ranked terms with a z-score over 1.96 and use them
      z1 <- which(z_scores > 1.96)
      z2 <- which(z_scores < -1.96)
      i1 <- order(zeta[z1],decreasing = TRUE)
      i2 <- order(abs(zeta[z2]),decreasing = TRUE)

      top_words_cat1 <- words[z1[i1]]
      top_words_cat2 <- words[z2[i2]]
      words[1:length(top_words_cat1)] <- top_words_cat1
      words[(length(words) - length(top_words_cat2) + 1):length(words)] <-top_words_cat2
  }

  if (length(zeta) > max_terms_to_display) {
    tot <- length(zeta)
    bound <- floor(max_terms_to_display/2)
    z_scores <- c(z_scores[1:bound], z_scores[(tot-bound+1):tot])
    zeta <- c(zeta[1:bound], zeta[(tot-bound+1):tot])
    y.tot <- c(y.tot[1:bound], y.tot[(tot-bound+1):tot])
    words <- c(words[1:bound], words[(tot-bound+1):tot])
  }

  max_y.tot <- max(y.tot)

  if (clean_publication_plots) {
      max.zeta.one <- which(z_scores > 1.96)
      max.zeta.two <- which(z_scores < -1.96)
  } else {
      max.zeta.one <- which(z_scores > 1.96)[1:display_top_words]
      max.zeta.two <- which(z_scores < -1.96)
      max.zeta.two <- max.zeta.two[(length(max.zeta.two)-display_top_words+1):length(max.zeta.two)]
      max.zeta.two <- rev(max.zeta.two)
  }

  # make sure that we put our top terms in the right places
  if (use_subsumed_ngrams) {
      words[max.zeta.one] <- top_words_cat1[1:display_top_words]
      words[max.zeta.two] <- top_words_cat2[1:display_top_words]
  }

  if (rank_by_log_odds) {
      words[max.zeta.one] <- top_words_cat1[1:display_top_words]
      words[max.zeta.two] <- top_words_cat2[1:display_top_words]
  }

  # determine if the user has specified limits and if so sets them manually
  if (!is.null(limits)) {
      display_limits <- 1.2 * abs(limits[2])
      ylims <- c(-display_limits, display_limits)
      xlims <- c(1, abs(limits[1]))
  } else {
      display_limits <- 1.2 * max(abs(zeta))
      ylims <- c(-display_limits, display_limits)
      xlims <- c(1, 2 * max_y.tot)
  }

  sig.z <- abs(z_scores) > 1.96
  psize <- 2 * abs(zeta)/max(abs(zeta))
  if (rank_by_log_odds) {
      plot(xlims, ylims,
           type = "n", log = "x", pch = 19, col = "black", cex = psize,
           main = title, ylab = "log-odds ratio", xlab = xlab)
  } else {
      plot(xlims, ylims,
           type = "n", log = "x", pch = 19, col = "black", cex = psize,
           main = title, ylab = expression(italic(z)-score), xlab = xlab)
  }

  points(y.tot, zeta, pch = 19, col = "gray", cex = psize)
  points(y.tot[sig.z], zeta[sig.z], pch = 19, col = "black",
    cex = psize[sig.z])

  points(y.tot[max.zeta.one], zeta[max.zeta.one], pch = 19,
         col = UMASS_BLUE, cex = psize[max.zeta.one])
  points(y.tot[max.zeta.two], zeta[max.zeta.two], pch = 19,
         col = UMASS_RED, cex = psize[max.zeta.two])

  if (!clean_publication_plots) {
      if (size_terms_by_frequency) {
          mtext(text = words[max.zeta.one], side = 4, col = UMASS_BLUE,
                las = 1, line = 1, at = seq(0.95 * display_limits,
                                            0.05 * display_limits, length.out = display_top_words),
                cex = psize[max.zeta.one])
          mtext(text = words[max.zeta.two], side = 4, col = UMASS_RED,
                las = 1, line = 1, at = seq(-0.95 * display_limits,
                                            -0.05 * display_limits, length.out = display_top_words),
                cex = psize[max.zeta.two])
      } else {
          mtext(text = words[max.zeta.one], side = 4, col = UMASS_BLUE,
                las = 1, line = 1, at = seq(0.95 * display_limits,
                                            0.05 * display_limits, length.out = display_top_words))
          mtext(text = words[max.zeta.two], side = 4, col = UMASS_RED,
                las = 1, line = 1, at = seq(-0.95 * display_limits,
                                            -0.05 * display_limits, length.out = display_top_words))
      }

      if (display_terms_next_to_points) {
          text(y.tot[max.zeta.one], zeta[max.zeta.one], words[max.zeta.one],
               pch = 19, col = UMASS_BLUE, pos = 4, cex = psize[max.zeta.one])
          text(y.tot[max.zeta.two], zeta[max.zeta.two], words[max.zeta.two],
               pch = 19, col = UMASS_RED, pos = 4, cex = psize[max.zeta.two])
      }

      text(1, 0.9 * display_limits, positive_category, col = UMASS_BLUE,
           pos = 4, cex = 2)
      text(1, -0.9 * display_limits, negative_category, col = UMASS_RED,
           pos = 4, cex = 2)
  }

}