#' Count Fixed Tokens
#'
#' Count the occurrence of tokens within a vector of strings. This function
#' differs from \code{\link[termco]{term_count}} in that \code{term_count} is
#' regex based, allowing for fuzzy matching. This function only searches for
#' lower cased tokens (words, number sequences, or punctuation). This counting
#' function is faster but less flexible.
#'
#' @param text.var The text string variable.
#' @param grouping.var The grouping variable(s). Default \code{NULL} generates
#' one word list for all text. Also takes a single grouping variable or a list
#' of 1 or more grouping variables. If \code{TRUE} an \code{id} variable is
#' used with a \code{seq_along} the \code{text.var}.
#' @param token.list A list of named character vectors of tokens. Search will
#' combine the counts for tokens supplied that are in the same vector. Tokens
#' are defined as \code{"^([a-z' ]+|[0-9.]+|[[:punct:]]+)$"} and should
#' conform to this standard. `code{token_count} can be used in a hierarchical
#' fashion as well; that is a list of tokens that can be passed and counted and
#' then a second (or more) pass can be taken with a new set of tokens on only
#' those rows/text elements that were left untagged (count
#' \code{\link[base]{rowSums}} is zero). This is accomplished by passing
#' a \code{\link[base]{list}} of \code{\link[base]{list}}s of search tokens.
#' See \bold{Examples} for the \strong{hierarchical tokens} section for a
#' demonstration.
#' @param stem logical. If \code{TRUE} the search is done after the terms have
#' been stemmed.
#' @param keep.punctuation logical. If \code{TRUE} the punctuation marks are
#' considered as tokens.
#' @param pretty logical. If \code{TRUE} pretty printing is used. Pretty
#' printing can be turned off globally by setting
#' \code{options(termco_pretty = FALSE)}.
#' @param group.names A vector of names that corresponds to group. Generally
#' for internal use.
#' @param meta.sep A character separator (or character vector of separators) to
#' break up the term list names (tags) into that will generate an merge table
#' attribute on the output that has the supplied tags and meta + sub tags as
#' dictated by the separator breaks.
#' @param meta.names A vector of names corresponding to the meta tags generated
#' by \code{meta.sep}.
#' @param \ldots Other arguments passed to \code{\link[gofastr]{q_dtm}}.
#' @return Returns a tibble object of term counts by
#' grouping variable. Has all of the same features as a \code{term_count}
#' object, meaning functions that work on a \code{term_count} object will
#' operate on a a \code{token_count} object as well.
#' @export
#' @examples
#' token_list <- list(
#' person = c('sam', 'i'),
#' place = c('here', 'house'),
#' thing = c('boat', 'fox', 'rain', 'mouse', 'box', 'eggs', 'ham'),
#' no_like = c('not like')
#' )
#'
#' token_count(sam_i_am, grouping.var = TRUE, token.list = token_list)
#' token_count(sam_i_am, grouping.var = NULL, token.list = token_list)
#'
#' \dontrun{
#' x <- presidential_debates_2012[["dialogue"]]
#'
#' bigrams <- frequent_ngrams(x, gram.length = 2)$collocation
#' bigram_model <- token_count(x, TRUE, token.list = as_term_list(bigrams))
#' as_dtm(bigram_model)
#'
#' if (!require("pacman")) install.packages("pacman")
#' pacman::p_load(tidyverse, lexicon, textshape)
#'
#' token_list <- lexicon::nrc_emotions %>%
#' textshape::column_to_rownames() %>%
#' t() %>%
#' textshape::as_list()
#'
#' presidential_debates_2012 %>%
#' with(token_count(dialogue, TRUE, token_list))
#'
#' presidential_debates_2012 %>%
#' with(token_count(dialogue, list(person, time), token_list))
#'
#' presidential_debates_2012 %>%
#' with(token_count(dialogue, list(person, time), token_list)) %>%
#' plot()
#' }
#'
#' ## hierarchical tokens
#' token_list <- list(
#' list(
#' person = c('sam', 'i')
#' ),
#' list(
#' place = c('here', 'house'),
#' thing = c('boat', 'fox', 'rain', 'mouse', 'box', 'eggs', 'ham')
#' ),
#' list(
#' no_like = c('not like'),
#' thing = c('train', 'goat')
#' )
#' )
#'
#' (x <- token_count(sam_i_am, grouping.var = TRUE, token.list = token_list))
#' attributes(x)[['pre_collapse_coverage']]
#'
#' ## External Dictionaries
#' \dontrun{
#' ## dictionary from quanteda
#' require(quanteda); require(stringi); require(textreadr)
#'
#' ## Nadra Pencle and Irina Malaescu (2016) What's in the Words? Development and Validation of a
#' ## Multidimensional Dictionary for CSR and Application Using Prospectuses. Journal of Emerging
#' ## Technologies in Accounting: Fall 2016, Vol. 13, No. 2, pp. 109-127.
#'
#' dict_corporate_social_responsibility <- "https://provalisresearch.com/Download/CSR.zip" %>%
#' textreadr::download() %>%
#' unzip(exdir = tempdir()) %>%
#' `[`(1) %>%
#' dictionary(file = .)
#'
#' csr <- dict_corporate_social_responsibility %>%
#' as_term_list() %>%
#' lapply(function(x){
#' x %>%
#' stringi::stri_replace_all_fixed('_', ' ') %>%
#' stringi::stri_replace_all_regex('\\s*\\(.+?\\)', '') %>%
#' stringi::stri_replace_all_regex('[^ -~]', "'")
#' })
#'
#' presidential_debates_2012 %>%
#' with(token_count(dialogue, list(time, person), csr)) %>%
#' plot()
#'
#' }
token_count <- function(text.var, grouping.var = NULL, token.list, stem = FALSE,
keep.punctuation = TRUE, pretty = ifelse(isTRUE(grouping.var), FALSE, TRUE),
group.names, meta.sep = '__', meta.names = c('meta'), ...) {
amodel <- FALSE
auto_map <- FALSE
## check for hierarchical terms
token.list <- warn_unnest(token.list, type = 'token')
list_list <- FALSE
if (is.list(token.list[[1]]) && length(token.list) > 1 && all(sapply(token.list, is.list))) list_list <- TRUE
if (!list_list) {
lens <- unlist(lapply(token.list, length))
token.list <- lapply(token.list, unique)
lens2 <- unlist(lapply(token.list, length))
} else {
lens <- unlist(lapply(unlist(token.list, recursive = FALSE), length))
token.list <- lapply(token.list, function(x) lapply(x, unique))
lens2 <- unlist(lapply(unlist(token.list, recursive = FALSE), length))
}
if (sum(lens) != sum(lens2)) warning("Duplicate tokens found within a named vector of `token.list`. Dropping duplicates.")
token.listsaved <- token.list
## swap out spaces as necessary
subwrds <- grep(".\\s+.", unlist(token.list), value=TRUE)
if (length(subwrds > 0)){
tph <-'termcoplaceholdertermco'
text.var <- .mgsub(subwrds, gsub("\\s+", tph, subwrds), tolower(text.var), perl=FALSE)
if (list_list) {
token.list <- lapply(token.list, function(a) {lapply(a, function(x) gsub("\\s+", tph, x))})
} else {
token.list <- lapply(token.list, function(x) gsub("\\s+", tph, x))
}
}
if (is.null(grouping.var)) {
G <- "all"
} else {
if (is.list(grouping.var)) {
if (is.data.frame(grouping.var)){
G <- colnames(grouping.var)
} else {
m <- gsub("(\'|\")\\]+$", "", unlist(as.character(substitute(grouping.var))[-1]))
G <- sapply(strsplit(m, "\\$|(\\[\\s*,?\\s*(\'|\"))", fixed = FALSE), function(x) {
x[length(x)]
})
}
} else {
if (isTRUE(grouping.var)) {
amodel <- TRUE
G <- "id"
} else {
G <- as.character(substitute(grouping.var))
G <- G[length(G)]
}
}
}
if (is.null(grouping.var)) {
grouping <- rep("all", length(text.var))
} else {
if (isTRUE(grouping.var)) {
grouping <- seq_along(text.var)
} else {
if (is.list(grouping.var) & length(grouping.var) > 1) {
grouping <- grouping.var
} else {
grouping <- unlist(grouping.var)
}
}
}
if (!missing(group.names)) {
G <- group.names
}
## choose stem or not version of doc-term-mat function
if (stem) {
make_dtm <- gofastr::q_dtm_stem
} else {
make_dtm <- gofastr::q_dtm
}
## Build data.frames with text and vars
DF <- data.frame(text.var, check.names = FALSE, stringsAsFactors = FALSE)
DF[G] <- grouping
## create DTM
dtm <- make_dtm(text.var, paste2(DF[G], sep = "termcosepperiodsepsepseptermco"), remove_punct = !keep.punctuation, ...)
nms <- colnames(dtm)
n.tokens <- slam::row_sums(dtm)
## respond based on hierarchical terms
if (list_list) {
## make sure for hierarchical terms that each observation is also a group
if(nrow(DF) != nrow(unique(DF[,G, drop=FALSE]))) {
stop("In order to run nested `token.list` then `grouping.var` must place every observation in its own group.")
}
## make sure the tokens conform to regex stadards
token.list <- lapply(token.list, token_lister_check)
## Auto create a map for same named term lists and
## add ending number to distinguish
token.nms <- lapply(token.list, names)
token.lens <- sapply(token.nms, length)
token.nms <- unlist(token.nms)
if (any(duplicated(token.nms))){
map <- as.list(unique(token.nms))
names(map) <- unique(token.nms)
for(i in names(map)){
suffix <- seq_len(sum(token.nms == i))
if (length(suffix) == 1) {
replacements <- i
map[i] <- NULL
} else {
replacements <- paste(i, seq_len(sum(token.nms == i)), sep = "termcosepsepsepseptermco")
map[[i]] <- paste(i, seq_len(sum(token.nms == i)), sep = "termcosepsepsepseptermco")
}
token.nms[token.nms == i] <- replacements
}
token.list <- Map(function(x, y) {
names(x) <- y
x
}, token.list, split(token.nms, rep(seq_along(token.lens), token.lens)))
auto_map <- TRUE
}
inds <- seq_along(text.var)
out_list <- vector(mode = "list", length = length(token.list))
for (i in seq_along(token.list)){
out_list[[i]] <- lapply(token.list[[i]], function(x){
x <- x[x %in% nms]
unname(slam::row_sums(dtm[, x]))
})
if (length(token.list[[i]]) > 1) {
locs <- which(rowSums(do.call(cbind, out_list[[i]])) > 0)
} else {
locs <- which(out_list[[i]][[1]] > 0)
}
dtm_locs <- !dtm[['i']] %in% locs
dtm[['i']] <- dtm[['i']][dtm_locs]
dtm[['j']] <- dtm[['j']][dtm_locs]
dtm[['v']] <- dtm[['v']][dtm_locs]
}
out <- data.table::as.data.table(data.frame(
do.call(cbind.data.frame, out_list),
stringsAsFactors = FALSE,
check.names = FALSE
))
} else {
## check tht token.list is a named list &
## tokens are words, numbers or punctuation
token.list <- token_lister_check(token.list)
out <- data.table::as.data.table(data.frame(lapply(token.list, function(x){
x <- x[x %in% nms]
unname(slam::row_sums(dtm[, x]))
}), stringsAsFactors = FALSE, check.names = FALSE))
}
grpv <- stats::setNames(do.call(rbind.data.frame, strsplit(rownames(dtm), "termcosepperiodsepsepseptermco")), G)
if (isTRUE(grouping.var)) grpv['id'] <- seq_along(grpv[['id']])
out <- data.table::data.table(grpv, n.tokens, out)
class(out) <- c("token_count", "term_count", "tbl_df", "tbl", "data.frame")
if(isTRUE(list_list)) class(out) <- c("hierarchical_term_count", class(out))
text <- new.env(hash=FALSE)
text[["text.var"]] <- text.var
regex <- new.env(hash=FALSE)
regex[["term.list"]] <- token.list
attributes(out)[["group.vars"]] <- G
if (isTRUE(list_list)) {
attributes(out)[["token.vars"]] <- unlist(lapply(token.list, names))
} else {
attributes(out)[["token.vars"]] <- names(token.list)
}
attributes(out)[["text.var"]] <- text
attributes(out)[["model"]] <- amodel
attributes(out)[["pretty"]] <- pretty
count <- new.env(hash=FALSE)
count[["count"]] <- out
attributes(out)[["weight"]] <- "count"
attributes(out)[["counts"]] <- count
attributes(out)[["tokens"]] <- token.listsaved
if(isTRUE(list_list)) attributes(out)[["hierarchical_terms"]] <- lapply(token.list, names)
if (isTRUE(auto_map)){
message("Collapsing duplicate `token.list` columns.")
out <- collapse_tags(out, map, ...)
}
attributes(out)[["token.vars"]] <- unique(gsub('termcosepsepsepseptermco\\d+$', '', attributes(out)[["token.vars"]]))
attributes(out)[["metatags"]] <- tags2meta(attributes(out)[["token.vars"]],
meta.sep = meta.sep, meta.names = meta.names)
out
}
## #' Prints a token_count Object
## #'
## #' Prints a token_count object.
## #'
## #' @param x The token_count object.
## #' @param digits The number of digits displayed.
## #' @param weight The weight type. Currently the following are available:
## #' \code{"proportion"}, \code{"percent"}. See \code{\link[termco]{weight}} for
## #' additional information.
## #' @param zero.replace The value to replace zero count elements with; defaults
## #' to \code{"0"}.
## #' @param pretty logical. If \code{TRUE} the counts print in a pretty fashion,
## #' combining count and weighted information into a single display.
## #' \code{pretty} printing can be permanently removed with
## #' \code{\link[termco]{as_count}}.
## #' @param \ldots ignored
## #' @method print token_count
## #' @export
## print.token_count <- function(x, digits = 2, weight = "percent",
## zero.replace = "0", pretty = getOption("termco_pretty"), ...) {
##
## n.tokens <- count <- NULL
## if (is.null(pretty)) pretty <- TRUE
## if (weight == "count") pretty <- FALSE
##
## print_order <- c(attributes(x)[['group.vars']], 'n.tokens', attributes(x)[['token.vars']])
##
## val <- validate_token_count(x)
## if (!isTRUE(val)) {
##
## termcols <- attributes(x)[["token.vars"]]
## wrdscol <- any(colnames(x) %in% 'n.tokens')
##
## if (wrdscol & !is.null(termcols) && any(colnames(x) %in% termcols)) {
##
## termcols <- colnames(x)[colnames(x) %in% termcols]
##
## } else {
##
## return(print(rm_class(x, "token_count")))
##
## }
## } else {
##
## termcols <- attributes(x)[["token.vars"]]
## }
##
## coverage <- sum(cov <- rowSums(x[, termcols]) != 0)/length(cov)
##
## start <- Sys.time()
## if (is.count(x) & pretty & attributes(x)[["pretty"]]) {
##
## tall <- tidyr::gather_(x, "term", "count", termcols)
## tall_weighted <- dplyr::mutate(tall, count = comb(count, n.tokens, digits = digits,
## zero.replace = zero.replace, weight = weight))
##
## x <- tidyr::spread_(tall_weighted, "term", "count")
## }
## ptime <- difftime(Sys.time(), start)
##
## class(x) <- class(x)[!class(x) %in% "token_count"]
## cat(sprintf("Coverage: %s%%", 100 * round(coverage, 4)), "\n")
##
## print(x[, print_order])
##
## ask <- getOption("termco_pretty_ask")
## if(is.null(ask)){
## ask <- TRUE
## }
##
## if(ask && ptime > .61 && interactive()){
## message(paste0(paste(rep("=", 70), collapse = ""), "\n"),
## "\nYour `token_count` object is larger and is taking a while to print.\n",
## "You can reduce this time by using `as_count` or setting:\n\n`options(termco_pretty = FALSE)`\n\n",
## "Would you like to globally set `options(termco_pretty = FALSE)` now?\n")
## ans <- utils::menu(c("Yes", "No", "Not Now"))
## switch(ans,
## `1` = {options(termco_pretty = FALSE)
## options(termco_pretty_ask = FALSE)},
## `2` = {options(termco_pretty_ask = FALSE)},
## `3` = {options(termco_pretty_ask = TRUE)}
## )
## }
##
## }
token_lister_check <- function(token.list){
token_check <- grepl("^([a-z' ]+|[0-9.]+|[[:punct:]]+)$", unlist(token.list))
if (!all(token_check)) {
warning(
paste0(
"The following tokens did not conform to expected standards:\n",
paste(paste0(" -", unlist(token.list)[which(!token_check)]), collapse = "\n")
)
)
}
nms <- names(token.list)
names(token.list)[sapply(nms, identical, "")] <- make.names(seq_len(length(nms[sapply(nms, identical, "")])))
if (!is.list(token.list)) {
warning("Expecting a named list for `token.list`; coercing to list.")
token.list <- as.list(token.list)
if (is.null(names(token.list))) token.list <- stats::setNames(token.list, token.list)
}
token.list
}
validate_token_count <- function(x, warn = FALSE){
terms <- ifelse(inherits(x, 'token_count'), "token.vars", "term.vars")
nwords <- ifelse(inherits(x, 'token_count'), "n.tokens", "n.words")
type <- ifelse(inherits(x, 'token_count'), "token", "term")
nms2 <- unlist(list(attributes(x)[["token.vars"]], "n.tokens"))
nms <- unlist(list(attributes(x)[["group.vars"]], nms2))
check <- all(nms %in% colnames(x)) && all(sapply(x[, nms2], is.numeric))
check2 <- all(sapply(c("group.vars", "token.vars", "weight", "pretty"), function(y){
!is.null(attributes(x)[[y]])
}))
check3 <- !any(colnames(x) %in% c(nms2, nms, "n.tokens"))
if (!check | !check2 | check3) {
if (isTRUE(warn)){
warning("Does not appear to be a `token_count` object.\n",
" Has the object or column names been altered/added?",
immediate. = TRUE
)
}
return(FALSE)
}
TRUE
}
#' Plots a token_count object
#'
#' Plots a token_count object.
#'
#' @param x The token_count object.
#' @param labels logical. If \code{TRUE} the cell count values will be included
#' on the heatmap.
#' @param low The color to be used for lower values.
#' @param high The color to be used for higher values.
#' @param grid The color of the grid (Use \code{NA} to remove the grid).
#' @param label.color The color to make labels if \code{labels = TRUE}.
#' @param label.size The size to make labels if \code{labels = TRUE}.
#' @param label.digits The number of digits to print if labels are printed.
#' @param weight The weight to apply to the cell values for gradient fill.
#' Currently the following are available:
#' \code{"proportion"}, \code{"percent"}, and \code{"count"}. See
#' \code{\link[termco]{weight}} for additional information.
#' @param \ldots ignored
#' @method plot token_count
#' @export
plot.token_count <- function(x, labels = FALSE, low ="white",
high = "red", grid = NA, label.color = "grey70", label.size = 3,
label.digits = if(weight=="count"){0} else {2}, weight = "percent", ...){
group <- attributes(x)[["group.vars"]]
if (weight == "count") {
y <- x
} else {
y <- weight(x, weight = weight)
}
y[["group.vars"]] <- paste2(y[, group], sep = "_")
y[["group.vars"]] <- factor(y[["group.vars"]], levels = rev(y[["group.vars"]]))
y <- y[!colnames(y) %in% group]
vars <- colnames(y)[!colnames(y) %in% c("group.vars", "n.tokens")]
dat <- tidyr::gather_(y, "terms", "values", vars)
if (isTRUE(labels)){
values <- NULL
fact <- ifelse(weight == "percent", 100, 1)
dat <- dplyr::mutate(dat, labels = values/fact, label.digits)
}
out <- ggplot2::ggplot(dat, ggplot2::aes_string(y = "group.vars", x = "terms", fill = "values")) +
ggplot2::theme_bw() +
ggplot2::theme(
axis.text.x = ggplot2::element_text(angle = 90, vjust = .5, hjust = 1),
panel.grid.minor = ggplot2::element_blank(),
panel.grid.major = ggplot2::element_blank(),
panel.border = ggplot2::element_rect(colour = "grey80"),
legend.key.width = grid::unit(.25, 'cm'),
legend.key.height = grid::unit(1, 'cm')
) +
ggplot2::xlab("Terms Categories") +
ggplot2::ylab("Groups") +
ggplot2::geom_tile(color = grid)
if (weight == "percent"){
out <- out +
ggplot2::scale_fill_gradient(high = high, low = low, name = "Percent",
labels = function(x) paste0(x, "%"))
} else {
out <- out +
ggplot2::scale_fill_gradient(high = high, low = low,
name = gsub("(\\w)(\\w*)","\\U\\1\\L\\2", weight, perl=TRUE))
}
if (isTRUE(labels)){
out <- out +
ggplot2::geom_text(ggplot2::aes_string(label = 'labels'),
color = label.color, size = label.size)
}
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.