R/covars_make_pos.R

Defines functions covars_make_pos.character covars_make_pos.data.frame covars_make_pos.corpus covars_make_pos.snippet covars_make_pos

Documented in covars_make_pos covars_make_pos.character covars_make_pos.corpus covars_make_pos.data.frame covars_make_pos.snippet

#' calculate part-of-speech information from text snippet data
#'
#' Add additional variables consisting of part-of-speech (POS) frequencies to
#' snippets.
#' @param x snippet data from [snippets_make()] consisting of the
#'   fields `text`, `docID`, and `snippetID`
#' @param text_field the name of the text field, if a [data.frame], default
#'   is `"text"`
#' @param normalize if `TRUE`, convert pos tag counts to rates
#' @param ... used to pass the `tagset` argument to
#'   [spacyr::spacy_parse()], for example `tagset = "penn"` to specify the Penn Treebank tagset scheme, instead of the Google
#'   universal tagset default.  See [spacyr::spacy_parse()].
#' @details Note that this requires spaCy to be installed (along with Python).
#'   See the installation instructions at
#'   <http://github.com/kbenoit/spacyr>.
#' @return the data.frame of added variables, consisting of the frequencies of
#'   parts of speech in each text
#' @export
#' @importFrom utils installed.packages
#' @examples
#' \dontrun{
#' # some examples here
#' }
covars_make_pos <- function(x, ...) {
    UseMethod("covars_make_pos")
}

#' @rdname covars_make_pos
#' @export
covars_make_pos.snippet <- function(x, ...) {
    covars_make_pos(x[["text"]], ...)
}

#' @rdname covars_make_pos
#' @export
covars_make_pos.corpus <- function(x, ...) {
    covars_make_pos(as.character(x), ...)
}
#' @rdname covars_make_pos
#' @export
covars_make_pos.data.frame <- function(x,  text_field = "text", ...) {
    covars_make_pos(x[[text_field]], ...)
}

#' @rdname covars_make_pos
#' @param dependency logical; if `TRUE` parse dependencies
#' @importFrom data.table data.table setkey setnames
#' @import spacyr
#' @export
covars_make_pos.character <- function(x, text_field = "text", dependency = TRUE, normalize = TRUE, ...) {
    if (!("spacyr" %in% installed.packages()[, "Package"])) {
        stop("you must first install spacyr to tag parts of speech")
    }
    
    spacy_initialize()
    result <- spacy_parse(x, lemma = FALSE, pos = TRUE, tag = TRUE,
                          dependency = dependency, entity = TRUE)
    orig_docid <- result$doc_id
    
    result <- subset(result, !(pos %in% c("PUNCT", "SPACE")))

    if (!dependency) {
        result$dep_rel <- ""
    }

    doc_id <- sentence_id <- tag <- pos <- dep_rel <- n_namedentities <- `:=` <-
        baseline_year <- .N <- n_sentence <- n_noun <- n_verb <- n_adjective <-
        n_adverb <- n_clause <- NULL

    # count named entities
    ne <- data.table(entity_extract(result))
    ne <- ne[, .N, by = doc_id]
    setnames(ne, "N", "n_namedentities")
    setkey(ne, doc_id)
    result <- data.table(result)
    orig_docid <- result[, unique(doc_id)]
    result_bydoc <- result[, list(max(sentence_id),
                                  sum(pos == "NOUN"),
                                  sum(pos == "VERB"),
                                  sum(pos == "ADJ"),
                                  sum(pos == "ADV"),
                                  sum(stringi::stri_detect_fixed(dep_rel, "cl")),
                                  .N),
                           by = doc_id]
    setnames(result_bydoc,
             c("doc_id", "n_sentence", "n_noun", "n_verb", "n_adjective", "n_adverb", "n_clause", "ntoken"))
    if (normalize) {
        result_bydoc[, c("pr_sentence", "pr_noun", "pr_verb", "pr_adjective", "pr_adverb", "pr_clause") :=
                         list(n_sentence/ntoken, n_noun/ntoken, n_verb/ntoken, n_adjective/ntoken, n_adverb/ntoken, n_clause/ntoken)]
    }
    setkey(result_bydoc, doc_id)

    result_bydoc <- ne[result_bydoc]
    # change missing ne to zero
    result_bydoc[is.na(n_namedentities), n_namedentities := 0]

    if (!dependency) result_bydoc[, c("n_clause", "pr_clause") := NULL]

    # spacyr::spacy_finalize()
    result <- as.data.frame(result_bydoc)[match(result_bydoc[, doc_id], orig_docid), ]
    row.names(result) <- NULL
    result
}
kbenoit/sophistication documentation built on May 12, 2021, 5:57 a.m.