R/textplot_keyness.R
In quanteda.textplots: Plots for the Quantitative Analysis of Textual Data

Documented in textplot_keyness

#' Plot word keyness
#'
#' Plot the results of a "keyword" of features comparing their differential
#' associations with a target and a reference group, after calculating keyness
#' using [quanteda.textstats::textstat_keyness()].
#' @param x a return object from [quanteda.textstats::textstat_keyness()]
#' @param show_reference logical; if `TRUE`, show key reference features in
#'   addition to key target features
#' @param show_legend logical; if `TRUE`, show legend
#' @param n integer; number of features to plot
#' @param min_count numeric; minimum total count of feature across the target
#'   and reference categories, for a feature to be included in the plot
#' @param margin numeric; size of margin where feature labels are shown
#' @param color character or integer; colours of bars for target and reference
#'   documents. `color` must have two elements when `show_reference = TRUE`.
#'   See \link[ggplot2:aes_colour_fill_alpha]{ggplot2::color}.
#' @param labelcolor character; color of feature labels.
#' @param labelsize numeric; size of feature labels and bars.  See
#'   \link[ggplot2:aes_linetype_size_shape]{ggplot2::size}.
#' @param font character; font-family of texts. Use default font if `NULL`.
#' @return a \pkg{ggplot2} object
#' @export
#' @author Haiyan Wang and Kohei Watanabe
#' @seealso [quanteda.textstats::textstat_keyness()]
#' @keywords textplot
#' @examples
#' \dontrun{
#' library("quanteda")
#' # compare Trump speeches to other Presidents by chi^2
#' dfmat1 <- data_corpus_inaugural |>
#'      corpus_subset(Year > 1980) |>
#'      tokens(remove_punct = TRUE) |>
#'      tokens_remove(stopwords("en")) |>
#'      dfm()
#' dfmat1 <- dfm_group(dfmat1, groups = dfmat1$President)
#' tstat1 <- quanteda.textstats::textstat_keyness(dfmat1, target = "Trump")
#' textplot_keyness(tstat1, margin = 0.2, n = 10)
#' tstat1 <- quanteda.textstats::textstat_keyness(dfmat1, target = "Trump")
#' textplot_keyness(tstat1, margin = 0.2, n = 10)
#'
#' # compare contemporary Democrats v. Republicans
#' corp <- data_corpus_inaugural |>
#'     corpus_subset(Year > 1960)
#' corp$party <- ifelse(docvars(corp, "President") %in% c("Nixon", "Reagan", "Bush", "Trump"),
#'                      "Republican", "Democrat")
#' dfmat2 <- corp |>
#'     tokens(remove_punct = TRUE) |>
#'     tokens_remove(stopwords("en")) |>
#'     dfm()
#' tstat2 <- quanteda.textstats::textstat_keyness(dfm_group(dfmat2, groups = dfmat2$party),
#'                                                target = "Democrat", measure = "lr")
#' textplot_keyness(tstat2, color = c("blue", "red"), n = 10)
#' }
textplot_keyness <-  function(x, show_reference = TRUE, show_legend = TRUE,
                              n = 20L, min_count = 2L, margin = 0.05,
                              color = c("darkblue", "gray"), labelcolor = "gray30",
                              labelsize = 4, font = NULL) {
    UseMethod("textplot_keyness")
}

#' @export
textplot_keyness.default <- function(x, show_reference = TRUE, show_legend = TRUE,
                                     n = 20L, min_count = 2L, margin = 0.05,
                                     color = c("darkblue", "gray"), labelcolor = "gray30",
                                     labelsize = 4, font = NULL) {
    stop(friendly_class_undefined_message(class(x), "textplot_keyness"))
}

#' @import ggplot2
#' @importFrom utils head tail
#' @export
textplot_keyness.keyness <- function(x, show_reference = TRUE, show_legend = TRUE,
                                     n = 20L, min_count = 2L, margin = 0.05,
                                     color = c("darkblue", "gray"), labelcolor = "gray30",
                                     labelsize = 4, font = NULL) {

    font <- check_font(font)
    color <- as.factor(color)
    if (show_reference) {
        if (length(color) > 2) {
            color <- color[1:2]
        } else if (length(color) == 1) {
            color <- rep(color, 2)
        }
    }

    # extract attribute befor subsetting
    measure <- colnames(x)[2]

    # drop infrequent words
    data <- x[(x$n_target + x$n_reference) >= min_count, , drop = FALSE]

    if (nrow(data) < 1)
        stop("Too few words in the documents.")

    data$keyness <- data[[2]]
    data$right <- data$keyness >= 0
    if (show_reference) {
        t <- intersect(which(data$right), head(seq(nrow(data)), n))
        r <- intersect(which(!data$right), tail(seq(nrow(data)), n))
        i <- union(t, r)
    } else {
        i <- intersect(which(data$right), head(seq(nrow(data)), n))
    }
    data <- data[i, , drop = FALSE]
    if (show_reference) {
        group <- attr(x, "groups")
        data$color <- color[2 - data$right]
    } else {
        data$color <- color[1]
        color <- group <- NULL
    }

    data$x1 <- ifelse(data$right, abs(data$keyness), abs(data$keyness) * -1)
    data$y1 <- rank(data$keyness, ties.method = "first")
    data$x2 <- 0
    data$y2 <- data$y
    margin <- margin * max(abs(data$x1)) * 2

    x1 <- y1 <- x2 <- y2 <- feature <- NULL
    ggplot(data) +
         xlim(if (show_reference) min(data$x1) - margin else 0, max(data$x1) + margin) +
         geom_segment(aes(x = x1, y = y1, xend = x2, yend = y2, color = color),
                      size = labelsize) +
         scale_colour_identity(NULL, labels = group, breaks = color,
                               guide = if (show_legend) "legend" else FALSE) +
         xlab(measure) +
         geom_label(aes(x = x1, y = y1, label = feature), label.size = NA, fill = NA,
                    vjust = "center", hjust = ifelse(data$right, "left", "right"),
                    color = labelcolor, size = labelsize, family = font) +
         theme_bw(base_family = font) +
         theme(axis.line = element_blank(),
               axis.title.y = element_blank(),
               axis.text.y = element_blank(),
               axis.ticks.y = element_blank(),
               plot.background = element_blank(),
               panel.grid.major.y = element_line(linetype = "dotted"))
}

Any scripts or data that you put into this service are public.

quanteda.textplots documentation built on Sept. 11, 2024, 5:35 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

quanteda.textplots
Plots for the Quantitative Analysis of Textual Data

R/textplot_keyness.R
In quanteda.textplots: Plots for the Quantitative Analysis of Textual Data

Defines functions textplot_keyness.keyness textplot_keyness.default textplot_keyness

Documented in textplot_keyness

Try the quanteda.textplots package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

quanteda.textplots Plots for the Quantitative Analysis of Textual Data

R/textplot_keyness.R In quanteda.textplots: Plots for the Quantitative Analysis of Textual Data

Defines functions textplot_keyness.keyness textplot_keyness.default textplot_keyness

Documented in textplot_keyness

Try the quanteda.textplots package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

quanteda.textplots
Plots for the Quantitative Analysis of Textual Data

R/textplot_keyness.R
In quanteda.textplots: Plots for the Quantitative Analysis of Textual Data