R/covars_make.R

Defines functions covars_make_all covars_make.character covars_make.corpus covars_make.data.frame covars_make.snippet covars_make

Documented in covars_make covars_make_all covars_make.character covars_make.corpus covars_make.data.frame covars_make.snippet

#' compute text-based variables from text or snippet data
#'
#' Compute additional variables to snippet data created by
#' [snippets_make()].  These are based on tokens, types, and various
#' readability measures.
#' @param x snippet data from [snippets_make()] consisting of the
#'   fields `text`, `docID`, and `snippetID`
#' @param readability_measure additional readability measures passed through in
#'   the `measure` argument passed to
#'   [quanteda.textstats::textstat_readability]. Because our standard input will be
#'   constituent elements rather than indexes, this defaults to `NULL`
#'   indicating that no compound measures will be used.
#' @param text_field the name of the text field, if a [data.frame], default
#'   is `"text"`
#' @param normalize if `TRUE`, return proportions of words/sentences as
#'   appropriate, instead of raw counts
#' @param ... arguments passed through to `covars_make_character`
#' @return the data.frame of snippets `x` with added variables.  Note:
#'
#' `W_wl.Dale.Chall` is the proportion of words *not* in the Dale-Chall word list.
#'
#' @import stringi
#' @importFrom data.table data.table
#' @import quanteda
#' @export
covars_make <- function(x, ...) {
    UseMethod("covars_make")
}

#' @rdname covars_make
#' @export
covars_make.snippet <- function(x, ...) {
    covars_make(x[["text"]], ...)
}

#' @rdname covars_make
#' @export
covars_make.data.frame <- function(x, text_field = "text", ...) {
    covars_make(x[[text_field]], ...)
}

#' @rdname covars_make
#' @export
covars_make.corpus <- function(x, ...) {
    covars_make(as.character(x), ...)
}

#' @rdname covars_make
#' @importFrom quanteda.textstats textstat_readability
#' @export
covars_make.character <- function(x, readability_measure = NULL, normalize = TRUE, ...) {
    # include Dale.Chall?
    dc <- any(grepl("^Dale\\.Chall", readability_measure))
    # always include some measures
    readability_measure <- unique(c("meanSentenceLength",
                                    "meanWordSyllables", 
                                    "Dale.Chall.old", readability_measure))
    # return the data frame plus the computed variables
    result <- textstat_readability(x, measure = readability_measure, intermediate = TRUE)
    # eliminate Dale.Chall.old if was not in readability_measure
    if (!dc) result[["Dale.Chall.old"]] <- NULL
    # remove document field
    result[["document"]] <- NULL
    # normalize result if needed
    if (normalize) {
        result[["meanWordChars"]] <- result[["C"]] / result[["W"]]
        result[["meanSentenceChars"]] <- result[["C"]] / result[["St"]]
        result[["meanSentenceSyllables"]] <- result[["Sy"]] / result[["St"]]
        result[, c("W3Sy", "W2Sy", "W_1Sy", "W6C", "W7C", "W_wl.Dale.Chall", "Wlt3Sy")] <-
            result[, c("W3Sy", "W2Sy", "W_1Sy", "W6C", "W7C", "W_wl.Dale.Chall", "Wlt3Sy")] / result$W
        # result$W <- result$St <- result$C <- result$Sy <- NULL
    }
    result
}

#' @rdname covars_make
#' @param dependency logical; if `TRUE` parse dependencies
#' @param verbose logical; if `TRUE` print status messages
#' @details `covars_make_all` calls `covars_make`,
#'   [covars_make_baselines()], and [covars_make_pos()],
#'   returning them as a data.frame.
#' @importFrom data.table data.table setkey
#' @export
covars_make_all <- function(x, ..., dependency = TRUE, verbose = FALSE) {
    `:=` <- doc_id <- NULL

    if (verbose) message("   ...computing readability statistics")
    covars1 <- covars_make(x, ...)
    if (verbose) message("   ...computing frequency baseline statistics")
    covars2 <- covars_make_baselines(x)
    if (verbose) message("   ...computing part-of-speech measures")
    covars3 <- covars_make_pos(x, dependency = dependency)

    namestmp <- row.names(covars1)
    covars1 <- data.table(covars1)
    covars1[, doc_id := namestmp]

    namestmp <- row.names(covars2)
    covars2 <- data.table(covars2)
    covars2[, doc_id := paste0("text", namestmp)]

    covars3 <- data.table(covars3)

    setkey(covars1, doc_id)
    setkey(covars2, doc_id)
    setkey(covars3, doc_id)
    covarsall <- covars1[covars2[covars3]]

    as.data.frame(covarsall)
}
kbenoit/sophistication documentation built on May 12, 2021, 5:57 a.m.