R/textstat_keyness.R

#' Calculate keyness statistics
#'
#' Calculate "keyness", a score for features that occur differentially across
#' different categories.  Here, the categories are defined by reference to a
#' "target" document index in the [dfm], with the reference group
#' consisting of all other documents.
#' @param x a [dfm] containing the features to be examined for keyness
#' @param target the document index (numeric, character or logical) identifying
#'   the document forming the "target" for computing keyness; all other
#'   documents' feature frequencies will be combined for use as a reference
#' @param measure (signed) association measure to be used for computing keyness.
#'   Currently available: `"chi2"`; `"exact"` (Fisher's exact test);
#'   `"lr"` for the likelihood ratio; `"pmi"` for pointwise mutual
#'   information.
#' @param sort logical; if `TRUE` sort features scored in descending order
#'   of the measure, otherwise leave in original feature order
#' @param correction if `"default"`, the Yates correction is applied to
#'   `"chi2"`; the Williams correction is applied to `"lr"`; and no
#'   correction is applied for the `"exact"` and `"pmi"` measures.
#'   Specifying a value other than the default can be used to override the
#'   defaults, for instance to apply the Williams correction to the chi2
#'   measure.  Specifying a correction for the `"exact"` and `"pmi"`
#'   measures has no effect and produces a warning.
#' @references Bondi, M. & Scott, M. (eds) (2010). *Keyness in
#'   Texts*. Amsterdam, Philadelphia: John Benjamins.
#'
#'   Stubbs, M. (2010). Three Concepts of Keywords. In *Keyness in
#'   Texts*, Bondi, M. & Scott, M. (eds): 1--42. Amsterdam, Philadelphia:
#'   John Benjamins.
#'
#'   Scott, M. & Tribble, C. (2006). *Textual Patterns: Keyword and Corpus
#'   Analysis in Language Education*. Amsterdam: Benjamins: 55.
#'
#'   Dunning, T. (1993). [Accurate Methods for the Statistics of Surprise and
#'   Coincidence](https://dl.acm.org/citation.cfm?id=972454). *Computational
#'   Linguistics*, 19(1): 61--74.
#' @return `textstat_keyness` returns a data.frame of features with their
#'   keyness scores, associated p-values, and frequency counts in both the
#'   target and reference groups.  For `measure = "chi2"` the score is the
#'   chi-squared value, signed positively if the observed value in the target
#'   exceeds its expected value; for `measure = "exact"` it is the estimate of
#'   the odds ratio; for `measure = "lr"` it is the likelihood ratio
#'   \eqn{G^2} statistic; and for `measure = "pmi"` it is the pointwise
#'   mutual information statistic.
#' @export
#' @keywords textstat
#' @importFrom stats chisq.test
#' @examples
#' # compare pre- v. post-war terms using grouping
#' period <- ifelse(docvars(data_corpus_inaugural, "Year") < 1945, "pre-war", "post-war")
#' dfmat1 <- dfm(data_corpus_inaugural, groups = period)
#' head(dfmat1) # make sure 'post-war' is in the first row
#' head(tstat1 <- textstat_keyness(dfmat1), 10)
#' tail(tstat1, 10)
#'
#' # compare pre- v. post-war terms using logical vector
#' dfmat2 <- dfm(data_corpus_inaugural)
#' head(textstat_keyness(dfmat2, docvars(data_corpus_inaugural, "Year") >= 1945), 10)
#'
#' # compare Trump 2017 to other post-war presidents
#' dfmat3 <- dfm(corpus_subset(data_corpus_inaugural, period == "post-war"))
#' head(textstat_keyness(dfmat3, target = "2017-Trump"), 10)
#'
#' # using the likelihood ratio method
#' head(textstat_keyness(dfm_smooth(dfmat3), measure = "lr", target = "2017-Trump"), 10)
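#'
#' # override the default correction, e.g. apply the Williams correction to chi2
#' head(textstat_keyness(dfmat3, measure = "chi2", correction = "williams",
#'                       target = "2017-Trump"), 10)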
textstat_keyness <- function(x, target = 1L,
                             measure = c("chi2", "exact", "lr", "pmi"),
                             sort = TRUE,
                             correction = c("default", "yates", "williams", "none")) {
    UseMethod("textstat_keyness")
}

#' @export
textstat_keyness.default <- function(x, target = 1L,
                                     measure = c("chi2", "exact", "lr", "pmi"),
                                     sort = TRUE,
                                     correction = c("default", "yates", "williams", "none")) {
    stop(friendly_class_undefined_message(class(x), "textstat_keyness"))
}

#' @export
textstat_keyness.dfm <- function(x, target = 1L,
                                 measure = c("chi2", "exact", "lr", "pmi"),
                                 sort = TRUE,
                                 correction = c("default", "yates", "williams", "none")) {

    x <- as.dfm(x)
    if (!sum(x)) stop(message_error("dfm_empty"))

    measure <- match.arg(measure)
    correction <- match.arg(correction)
    # error checking
    if (ndoc(x) < 2)
        stop("x must have at least two documents")
    if (is.character(target) && !(all(target %in% docnames(x))))
        stop("target not found in docnames(x)")
    if (is.numeric(target) && any(target < 1 | target > ndoc(x)))
        stop("target index outside range of documents")
    if (is.logical(target) && length(target) != ndoc(x))
        stop("logical target value length must equal the number of documents")

    # convert all inputs into a logical vector
    if (is.numeric(target)) {
        target <- seq_len(ndoc(x)) %in% target
    } else if (is.character(target)) {
        target <- docnames(x) %in% target
    } else if (!is.logical(target)) {
        stop("target must be numeric, character or logical")
    }

    # check if number of target documents < ndoc
    if (sum(target) >= ndoc(x)) {
        stop("target cannot be all the documents")
    }

    # use original docnames only when there are two (different) documents
    if (ndoc(x) == 2 && length(unique(docnames(x))) > 1) {
        label <- docnames(x)[order(target, decreasing = TRUE)]
    } else {
        if (sum(target) == 1 && !is.null(docnames(x)[target])) {
            label <- c(docnames(x)[target], "reference")
        } else {
            label <- c("target", "reference")
        }
    }
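    # collapse the dfm into exactly two rows: the target counts first,
    # followed by the combined reference counts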
    grouping <- factor(target, levels = c(TRUE, FALSE), labels = label)
    temp <- dfm_group(x, groups = grouping)

    if (measure == "chi2") {
        result <- keyness_chi2_dt(temp, correction)
    } else if (measure == "lr") {
        result <- keyness_lr(temp, correction)
    } else if (measure == "exact") {
        if (!correction %in% c("default", "none"))
            warning("no correction is applied when measure = \"exact\"")
        result <- keyness_exact(temp)
    } else if (measure == "pmi") {
        if (!correction %in% c("default", "none"))
            warning("no correction is applied when measure = \"pmi\"")
        result <- keyness_pmi(temp)
    } else {
        stop(measure, " not yet implemented for textstat_keyness")
    }

    if (sort)
        result <- result[order(result[, 2], decreasing = TRUE), ]

    attr(result, "groups") <- docnames(temp)
    class(result) <- c("keyness", "textstat", "data.frame")
    rownames(result) <- as.character(seq_len(nrow(result)))
    return(result)
}
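
# All of the internal measures below score, for each feature, a 2x2 table of
# counts of the form
#
#                    target   reference
#   feature             a         b
#   other features      c         d
#
# where a + c and b + d are the total token counts of the target and
# reference groups respectively.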

#' Compute keyness (internal functions)
#'
#' Internal function used in textstat_keyness.  Computes \eqn{\chi^2} with
#' Yates' continuity correction for 2x2 tables.
#' @name keyness
#' @param x a [dfm] object
#' @details `keyness_chi2_dt` uses vectorized computation via data.table
#' objects.
#' @return a data.frame of chi2 and p-values with rows named for each feature
#' @examples
#' dfmat <- dfm(c(d1 = "a a a b b c c c c c c d e f g h h",
#'                d2 = "a a b c c d d d d e f h"))
#' quanteda.core:::keyness_chi2_dt(dfmat)
#' @keywords textstat internal
#' @importFrom data.table data.table :=
#' @importFrom stats dchisq
#' @references
#'   <https://en.wikipedia.org/wiki/Yates%27s_correction_for_continuity>
keyness_chi2_dt <- function(x, correction = c("default", "yates", "williams", "none")) {

    correction <- match.arg(correction)
    a <- b <- c <- d <- N <- E <- chi2 <- p <- cor_app <- q <- NULL
    if (ndoc(x) > 2) stop("x can only have 2 rows")
    dt <- data.table(feature = featnames(x),
                     a = as.numeric(x[1, ]),
                     b = as.numeric(x[2, ]))
    dt[, c("c", "d") := list(sum(x[1, ]) - a, sum(x[2, ]) - b)]
    dt[, N := (a + b + c + d)]
    dt[, E := (a + b) * (a + c) / N]

    if (correction == "default" | correction == "yates") {
        dt[, cor_app := (((a + b) * (a + c) / N < 5 | (a + b) * (b + d) / N < 5 |
                              (a + c) * (c + d) / N < 5 | (c + d) * (b + d) / N < 5)
                         & abs(a * d - b * c) >= N / 2)]
        # the correction is usually only recommended if the smallest expected frequency is less than 5
        # the correction should not be applied if |ad - bc| is less than N/2.
        # compute using the direct formula - see link above (adds sign)
        dt[, chi2 := ifelse(cor_app,
                            (N * (abs(a * d - b * c) - N * 0.5) ^ 2) /
                                ((a + b) * (c + d) * (a + c) * (b + d)) * ifelse(a > E, 1, -1),
                            (N * abs(a * d - b * c) ^ 2) /
                                ((a + b) * (c + d) * (a + c) * (b + d)) * ifelse(a > E, 1, -1))]
    } else if (correction == "williams") {
        # the Williams correction cannot be used if there are any zeros in the table
        # http://influentialpoints.com/Training/g-likelihood_ratio_test.htm
        dt[, q := ifelse(a * b * c * d == 0, 1,
                         1 + (N / (a + b) + N / (c + d) - 1) * (N / (a + c) + N / (b + d) - 1) / (6 * N))]
        dt[, chi2 := (N * abs(a * d - b * c) ^ 2) / ((a + b) * (c + d) * (a + c) * (b + d)) * ifelse(a > E, 1, -1) / q]
    } else {
        dt[, chi2 := (N * abs(a * d - b * c) ^ 2) / ((a + b) * (c + d) * (a + c) * (b + d)) * ifelse(a > E, 1, -1)]
    }

    # compute p-values
    dt[, p := 1 - stats::pchisq(abs(chi2), 1)]

    data.frame(feature = dt$feature,
               chi2 = dt[, chi2],
               p = dt[, p],
               n_target = as.vector(x[1, ]),
               n_reference = as.vector(x[2, ]),
               stringsAsFactors = FALSE)
}
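
# A minimal sanity check (a sketch, not run at package load): the unsigned,
# uncorrected chi2 for a single feature should match stats::chisq.test() with
# correct = FALSE on the same 2x2 table, e.g. for hypothetical counts
# a = 3, b = 2, c = 14, d = 10:
#   chisq.test(rbind(c(3, 2), c(14, 10)), correct = FALSE)$statistic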

#' @rdname keyness
#' @importFrom stats chisq.test
#' @importFrom data.table data.table :=
#' @details
#' `keyness_chi2_stats` uses element-by-element application of
#' [chisq.test][stats::chisq.test].
#' @examples
#' quanteda.core:::keyness_chi2_stats(dfmat)
keyness_chi2_stats <- function(x) {

    sums <- rowSums(x)
    result <- as.data.frame(
        do.call(rbind, apply(x, 2, function(y)
            keyness(as.numeric(y[1]), as.numeric(y[2]), sums[1], sums[2])))
    )

    data.frame(feature = colnames(x),
               chi2 = result$chi2,
               p = result$p,
               n_target = as.vector(x[1, ]),
               n_reference = as.vector(x[2, ]),
               stringsAsFactors = FALSE)
}

#' @rdname keyness
#' @param t (scalar) frequency of target
#' @param f (scalar) frequency of reference
#' @param sum_t total of all target words
#' @param sum_f total of all reference words
#' @keywords internal
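#' @examples
#' # a hypothetical feature occurring 10 times among 1000 target tokens and
#' # 5 times among 2000 reference tokens:
#' quanteda.core:::keyness(10, 5, 1000, 2000)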
keyness <- function(t, f, sum_t, sum_f) {

    tb <- as.table(rbind(c(t, f), c(sum_t - t, sum_f - f)))
    suppressWarnings(chi <- stats::chisq.test(tb))
    t_exp <- chi$expected[1, 1]
    list(chi2 = unname(chi$statistic) * ifelse(t > t_exp, 1, -1),
         p = unname(chi$p.value))
}

#' @rdname keyness
#' @details
#' `keyness_exact` computes Fisher's exact using element-by-element
#' application of [fisher.test][stats::fisher.test], returning the odds ratio.
#' @importFrom stats fisher.test
#' @examples
#' quanteda.core:::keyness_exact(dfmat)
keyness_exact <- function(x) {
    sums <- rowSums(x)
    result <- as.data.frame(
        do.call(rbind,
                apply(x, 2, function(y) {
                    et <- stats::fisher.test(matrix(c(as.numeric(y),
                                                      as.numeric(sums - y)),
                                                    nrow = 2))
                    data.frame(or = as.numeric(et$estimate), p = et$p.value)
                }))
    )

    data.frame(feature = colnames(x),
               or = result$or,
               p = result$p,
               n_target = as.vector(x[1, ]),
               n_reference = as.vector(x[2, ]),
               stringsAsFactors = FALSE)
}
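
# A minimal illustration with hypothetical counts: a feature appearing 3 times
# among 17 target tokens and 2 times among 12 reference tokens is tested as
#   stats::fisher.test(matrix(c(3, 2, 14, 10), nrow = 2))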

#' @rdname keyness
#' @param correction the continuity correction to apply: one of `"default"`,
#'   `"yates"`, `"williams"`, or `"none"`
#' @details `keyness_lr` computes the \eqn{G^2} likelihood ratio statistic
#'   using vectorized computation.
#' @examples
#' quanteda.core:::keyness_lr(dfmat)
#' @references
#' <http://influentialpoints.com/Training/g-likelihood_ratio_test.htm>
keyness_lr <- function(x, correction = c("default", "yates", "williams", "none")) {

    correction <- match.arg(correction)
    epsilon <- 1e-9  # to offset zero cell counts
    a <- b <- c <- d <- N <- E11 <- G2 <- p <- cor_app <- correction_sign <- q <- NULL
    if (ndoc(x) > 2)
        stop("x can only have 2 rows")
    dt <- data.table(feature = featnames(x),
                     a = as.numeric(x[1, ]),
                     b = as.numeric(x[2, ]))
    dt[, c("c", "d") := list(sum(x[1, ]) - a, sum(x[2, ]) - b)]
    dt[, N := (a + b + c + d)]
    dt[, E11 := (a + b) * (a + c) / N]

    if (correction == "default" | correction == "yates") {

        dt[, cor_app := (((a + b) * (a + c) / N < 5 | (a + b) * (b + d) / N < 5 |
                              (a + c) * (c + d) / N < 5 | (c + d) * (b + d) / N < 5)
                         & abs(a * d - b * c) > N / 2)]
        # the correction is usually only recommended if the smallest expected frequency is less than 5
        # the correction should not be applied if |ad - bc| is less than N/2.
        # implement Yates continuity correction
        # If (ad-bc) is positive, subtract 0.5 from a and d and add 0.5 to b and c.
        # If (ad-bc) is negative, add 0.5 to a and d and subtract 0.5 from b and c.
        dt[, correction_sign := a * d - b * c > 0]
        dt[, c("a", "d", "b", "c") := list(a + ifelse(cor_app, ifelse(correction_sign, -0.5, 0.5), 0),
                                           d + ifelse(cor_app, ifelse(correction_sign, -0.5, 0.5), 0),
                                           b + ifelse(cor_app, ifelse(correction_sign, 0.5, -0.5), 0),
                                           c + ifelse(cor_app, ifelse(correction_sign, 0.5, -0.5), 0))]
    }

    dt[, G2 := (2 * (a * log(a / E11 + epsilon) +
                         b * log(b / ((a + b) * (b + d) / N) + epsilon) +
                         c * log(c / ((a + c) * (c + d) / N) + epsilon) +
                         d * log(d / ((b + d) * (c + d) / N) + epsilon))) *
           ifelse(a > E11, 1, -1)]

    if (correction == "williams") {
        # the Williams correction cannot be used if there are any zeros in the table
        # http://influentialpoints.com/Training/g-likelihood_ratio_test.htm
        dt[, q := ifelse(a * b * c * d == 0,
                         1,
                         1 + (N / (a + b) + N / (c + d) - 1) * (N / (a + c) + N / (b + d) - 1) / (6 * N))]
        dt[, G2 := G2 / q]
    }

    # compute p-values
    dt[, p := 1 - stats::pchisq(abs(G2), 1)]

    data.frame(feature = dt$feature,
               G2 = dt[, G2],
               p = dt[, p],
               n_target = as.vector(x[1, ]),
               n_reference = as.vector(x[2, ]),
               stringsAsFactors = FALSE)
}

#' @rdname keyness
#' @details `keyness_pmi` computes the pointwise mutual information statistic
#'   using vectorized computation.
#' @examples
#' quanteda.core:::keyness_pmi(dfmat)
keyness_pmi <- function(x) {

    a <- b <- c <- d <- N <- E11 <- pmi <- p <- NULL
    if (ndoc(x) > 2)
        stop("x can only have 2 rows")
    dt <- data.table(feature = featnames(x),
                     a = as.numeric(x[1, ]),
                     b = as.numeric(x[2, ]))
    dt[, c("c", "d") := list(sum(x[1, ]) - a, sum(x[2, ]) - b)]
    dt[, N := (a + b + c + d)]
    dt[, E11 := (a + b) * (a + c) / N]
    epsilon <- 1e-9  # to offset zero cell counts
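    # PMI = log(observed / expected) for the target cell, i.e. log(a / E11)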
    dt[, pmi := log(a / E11 + epsilon)]

    #normalized pmi
    #dt[, pmi :=   log(a  / E11) * ifelse(a > E11, 1, -1)/(-log(a/N)) ]

    # compute p-values
    dt[, p := 1 - stats::pchisq(abs(pmi), 1)]

    data.frame(feature = dt$feature,
               pmi = dt[, pmi],
               p = dt[, p],
               n_target = as.vector(x[1, ]),
               n_reference = as.vector(x[2, ]),
               stringsAsFactors = FALSE)
}