R/textstat_frequency.R
In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data

Documented in textstat_frequency

#' Tabulate feature frequencies
#'
#' Produces counts and document frequencies summaries of the features in a
#' [dfm][quanteda::dfm], optionally grouped by a [docvars][quanteda::docvars]
#' variable or other supplied grouping variable.
#' @param x a [dfm][quanteda::dfm] object
#' @param n (optional) integer specifying the top `n` features to be returned,
#' within group if `groups` is specified
#' @param ties_method character string specifying how ties are treated.  See
#'   [base::rank()] for details.  Unlike that function, however, the default is
#'   `"min"`, so that frequencies of 10, 10, 11 would be ranked 1, 1, 3.
#' @param ... additional arguments passed to [dfm_group()][quanteda::dfm_group].
#'   This can be useful in passing `force = TRUE`, for instance, if you are
#'   grouping a dfm that has been weighted.
#' @inheritParams quanteda::groups
#' @return a data.frame containing the following variables:
#' \describe{
#' \item{`feature`}{(character) the feature}
#' \item{`frequency`}{count of the feature}
#' \item{`rank`}{rank of the feature, where 1 indicates the greatest
#' frequency}
#' \item{`docfreq`}{document frequency of the feature, as a count (the
#' number of documents in which this feature occurred at least once)}
#' \item{`docfreq`}{document frequency of the feature, as a count}
#' \item{`group`}{(only if `groups` is specified) the label of the group.
#' If the features have been grouped, then all counts, ranks, and document
#' frequencies are within group.  If groups is not specified, the `group`
#' column is omitted from the returned data.frame.}
#' }
#' @examples
#' library("quanteda")
#' set.seed(20)
#' dfmat1 <- dfm(tokens(c("a a b b c d", "a d d d", "a a a")))
#'
#' textstat_frequency(dfmat1)
#' textstat_frequency(dfmat1, groups = c("one", "two", "one"), ties_method = "first")
#' textstat_frequency(dfmat1, groups = c("one", "two", "one"), ties_method = "average")
#'
#' dfmat2 <- corpus_subset(data_corpus_inaugural, President == "Obama") %>%
#'    tokens(remove_punct = TRUE) %>%
#'    tokens_remove(stopwords("en")) %>%
#'    dfm()
#' tstat1 <- textstat_frequency(dfmat2)
#' head(tstat1, 10)
#'
#' dfmat3 <- head(data_corpus_inaugural) %>%
#'    tokens(remove_punct = TRUE) %>%
#'    tokens_remove(stopwords("en")) %>%
#'    dfm()
#' textstat_frequency(dfmat3, n = 2, groups = President)
#'
#'
#' \dontrun{
#' # plot 20 most frequent words
#' library("ggplot2")
#' ggplot(tstat1[1:20, ], aes(x = reorder(feature, frequency), y = frequency)) +
#'     geom_point() +
#'     coord_flip() +
#'     labs(x = NULL, y = "Frequency")
#'
#' # plot relative frequencies by group
#' dfmat3 <- data_corpus_inaugural %>%
#'     corpus_subset(Year > 2000) %>%
#'     tokens(remove_punct = TRUE) %>%
#'     tokens_remove(stopwords("en")) %>%
#'     dfm() %>%
#'     dfm_group(groups = President) %>%
#'     dfm_weight(scheme = "prop")
#'
#' # calculate relative frequency by president
#' tstat2 <- textstat_frequency(dfmat3, n = 10, groups = President)
#'
#' # plot frequencies
#' ggplot(data = tstat2, aes(x = factor(nrow(tstat2):1), y = frequency)) +
#'     geom_point() +
#'     facet_wrap(~ group, scales = "free") +
#'     coord_flip() +
#'     scale_x_discrete(breaks = nrow(tstat2):1,
#'                        labels = tstat2$feature) +
#'     labs(x = NULL, y = "Relative frequency")
#' }
#' @return `textstat_frequency` returns a data.frame of features and
#'   their term and document frequencies within groups.
#' @export
#' @importFrom quanteda dfm_group
#' @keywords plot
textstat_frequency <- function(x, n = NULL, groups = NULL,
                               ties_method = c("min", "average", "first", "random", "max", "dense"),
                               ...) {
    UseMethod("textstat_frequency")
}

#' @export
textstat_frequency.default <- function(x, n = NULL, groups = NULL,
                               ties_method = c("min", "average", "first", "random", "max", "dense"),
                               ...) {
    stop(friendly_class_undefined_message(class(x), "textstat_frequency"))
}

#' @importFrom stats ave
#' @export
textstat_frequency.dfm <- function(x, n = NULL, groups = NULL,
                               ties_method = c("min", "average", "first", "random", "max", "dense"),
                               ...) {
    ties_method <- match.arg(ties_method)
    x <- as.dfm(x)

    if (!sum(x))
        stop(message_error("dfm_empty"))

    if (missing(groups)) {
        groups <- rep("all", ndoc(x))
    } else {
        field <- deparse(substitute(groups))
        groups <- eval(substitute(groups), get_docvars(x, user = TRUE, system = TRUE), parent.frame())
        if (!field %in% names(get_docvars(x)) || !is.factor(groups))
            field <- NULL
        groups <- as.factor(groups)
    }

    tf <- x
    tf <- dfm_group(tf, groups, ...)
    tf <- as(as(tf, "TsparseMatrix"), "dgTMatrix")

    df <- dfm_weight(x, "boolean", force = TRUE)
    df <- dfm_group(df, groups, ...)
    df <- as(as(df, "TsparseMatrix"), "dgTMatrix")

    result <- data.frame(
        feature = colnames(tf)[tf@j + 1L],
        frequency = tf@x,
        rank = NA,
        docfreq = df@x,
        group = rownames(tf)[tf@i + 1L],
        stringsAsFactors = FALSE
    )

    # get the frequency ranks and sort
    result$rank <- ave(result$frequency, result$group,
                       FUN = function(y) rank(-y, ties.method = ties_method))
    result <- result[order(result$group, result$rank), ]

    # keep only first n items by group, if n is specified
    if (!is.null(n)) {
        result <- do.call(rbind, lapply(split(result, result$group), head, n))
    }

    class(result) <- c("frequency", "textstat", "data.frame")
    rownames(result) <- as.character(seq_len(nrow(result)))
    return(result)
}

Any scripts or data that you put into this service are public.

quanteda.textstats documentation built on Sept. 11, 2024, 6:39 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

quanteda.textstats
Textual Statistics for the Quantitative Analysis of Textual Data

R/textstat_frequency.R
In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data

Defines functions textstat_frequency.dfm textstat_frequency.default textstat_frequency

Documented in textstat_frequency

Try the quanteda.textstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

quanteda.textstats Textual Statistics for the Quantitative Analysis of Textual Data

R/textstat_frequency.R In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data

Defines functions textstat_frequency.dfm textstat_frequency.default textstat_frequency

Documented in textstat_frequency

Try the quanteda.textstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

quanteda.textstats
Textual Statistics for the Quantitative Analysis of Textual Data

R/textstat_frequency.R
In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data