term_count: Search For and Count Terms


View source: R/term_count.R

Description

term_count - Search a string by any number of grouping variables for categories (themes) of grouped root terms/substrings.

Usage

term_count(
  text.var,
  grouping.var = NULL,
  term.list,
  ignore.case = TRUE,
  pretty = ifelse(isTRUE(grouping.var), FALSE, TRUE),
  group.names,
  meta.sep = "__",
  meta.names = c("meta"),
  ...
)

Arguments

text.var

The text string variable.

grouping.var

The grouping variable(s). The default, NULL, generates one word list for all of the text. Also takes a single grouping variable or a list of one or more grouping variables. If TRUE, an id variable created with seq_along(text.var) is used.
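For illustration, the three forms of grouping.var look roughly like the following (dat, its columns, and terms are hypothetical names, not objects from this package):

```r
## one output row summarizing all of the text
term_count(dat$text, grouping.var = NULL, term.list = terms)

## one output row per speaker
term_count(dat$text, grouping.var = dat$speaker, term.list = terms)

## one output row per text element, keyed by an id from seq_along(dat$text)
term_count(dat$text, grouping.var = TRUE, term.list = terms)
```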

term.list

A list of named character vectors. term_count can also be used in a hierarchical fashion; that is, a list of regexes can be passed and counted, and then a second (or more) pass can be taken with a new set of regexes on only those rows/text elements that were left untagged (i.e., the count rowSums is zero). This is accomplished by passing a list of lists of regexes. See the hierarchical terms section of the Examples for a demonstration.

ignore.case

logical. If FALSE, the pattern matching is case sensitive and if TRUE, case is ignored during matching.

pretty

logical. If TRUE pretty printing is used. Pretty printing can be turned off globally by setting options(termco_pretty = FALSE).

group.names

A vector of names that corresponds to group. Generally for internal use.

meta.sep

A character separator (or character vector of separators) used to break up the term list names (tags). The breaks generate a merge table attribute on the output containing the supplied tags plus the meta and sub tags dictated by the separator breaks. Meta tags are used in tidy_counts. Note that these tags can also be set after the term_count object is returned, using the set_meta_tags function.

meta.names

A vector of names corresponding to the meta tags generated by meta.sep.
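As a hedged sketch of how meta.sep and meta.names interact (the tag names below are invented for illustration and are not from this package):

```r
## tag names carry a meta tag and a sub tag joined by "__"
term_list <- list(
    `positive__smile`  = c('happy', 'glad'),
    `positive__praise` = c('great', 'fantastic'),
    `negative__frown`  = c('sad', 'unhappy')
)

## meta.sep = "__" splits each tag name at the separator, so the merge
## table attribute pairs the meta tags "positive"/"negative" (the column
## named via meta.names = c("meta")) with the sub tags smile, praise,
## and frown.
term_count(text, TRUE, term_list, meta.sep = "__", meta.names = c("meta"))
```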

...

ignored.

Value

Returns a tibble object of term counts by grouping variable.

Note

Note that while a term_count object prints as a combination of integer counts with weighted values (by default, percent of terms) in parentheses, the underlying object is actually a tibble of integer term/substring counts. The user can permanently alter a term_count object to print as integers using the as_count function. A percent coverage also prints; this is the rate of grouping variables with no term found (i.e., the rowSums for the terms is zero). For more details, see coverage.

Examples

## Not run: 
data(presidential_debates_2012)

discourse_markers <- list(
    response_cries = c("\\boh", "\\bah", "\\baha", "\\bouch", "yuk"),
    back_channels = c("uh[- ]huh", "uhuh", "yeah"),
    summons = "\\bhey",
    justification = "because"
)

(markers <- with(presidential_debates_2012,
    term_count(dialogue, list(person, time), discourse_markers)
))

print(markers, pretty = FALSE)
print(markers, zero.replace = "_")
plot(markers)
plot(markers, labels = TRUE)

# permanently remove pretty printing
(markers2 <- as_count(markers))

# manipulating the output in a dplyr chain
library(dplyr)

presidential_debates_2012 %>%
    with(., term_count(dialogue, list(person, time), discourse_markers)) %>%
    as_count()  # removes pretty print method (not necessary to manipulate)

presidential_debates_2012 %>%
    with(., term_count(dialogue, list(person, time), discourse_markers)) %>%
    mutate(totals = response_cries + back_channels + summons + justification) %>%
    arrange(-totals)

## hierarchical terms
trms <- frequent_terms(presidential_debates_2012[["dialogue"]])[[1]]

discourse_markers <- list(
    response_cries = c("\\boh", "\\bah", "\\baha", "\\bouch", "yuk"),
    back_channels = c("uh[- ]huh", "uhuh", "yeah"),
    summons = "hey",
    justification = "because"
)

dbl_list <- list(
    discourse_markers,
    setNames(as.list(trms[1:8]), trms[1:8]),
    setNames(as.list(trms[9:length(trms)]), trms[9:length(trms)])
)

x <- with(presidential_debates_2012,
    term_count(dialogue, TRUE, dbl_list)
)

coverage(x)

## Auto mapping hierarchical terms w/ duplicate names
trpl_list <- list(
    list(
        response_cries = c("\\boh", "\\bah", "\\baha", "\\bouch", "yuk"),
        back_channels = c("uh[- ]huh", "uhuh", "yeah"),
        summons = "hey",
        justification = "because"
    ),
    list(summons ='the'),
    list(summons = 'it', justification = 'ed\\s')
)

(x2 <- with(presidential_debates_2012, term_count(dialogue, TRUE, trpl_list)))

## get the pre-collapse hierarchical coverage
attributes(x2)[['pre_collapse_coverage']]

## End(Not run)

## External dictionaries
## Not run: 
## dictionary from quanteda
require(quanteda); require(textreadr); require(magrittr)

## Laver. M. & Garry, J. (2000). Estimating Policy Positions from Political Texts. American
##   Journal of Political Science, 44 (3), 619-634.

dict_laver_garry <- textreadr::download("https://provalisresearch.com/Download/LaverGarry.zip") %>%
    unzip(exdir = tempdir()) %>%
    `[`(1) %>%
    dictionary(file = .)

lg <- as_term_list(dict_laver_garry)

presidential_debates_2012 %>%
     with(term_count(dialogue, list(time, person), lg)) %>%
     plot()

## End(Not run)

## Not run: 
## use with the qdapRegex package for feature extraction

if (!require("pacman")) install.packages("pacman")
pacman::p_load(qdapRegex, termco, dplyr, textshape, magrittr)

x <- c(
    "@hadley I like #rstats for #ggplot2 work.",
    "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats:
        http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @timelyportfolio",
    "Slides from great talk: @ramnath_vaidya: Interactive slides from Interactive Visualization
        presentation #user2014. http://ramnathv.github.io/user2014-rcharts/#1",
    "fred is fred@foo.com and joe is joe@example.com - but @this is a",
    "twitter handle for twit@here.com or foo+bar@google.com/fred@foo.fnord",
    "hello world",
    "I went to Washington Heights, NY for food! ",
    "It's in West ven,PA, near Bolly Bolly Bolly, CA!",
    "I like Movies, PG13",
    'There is at UCLA', 
    'And at UB too', 
    'But UB is not UCLA.', 
    'It is like RSU',
    "Dr. Brend is mizz hart's in mrs. Holtz's.",
    "Where is mr. Bob Jr. and Ms. John Kennedy?",
    "I want $2.33 at 2:30 p.m. to go to A.n.p.",
    "She will send it A.S.A.P. (e.g. as soon as you can) said I.",
    "Hello world.", "In the U. S. A.",
    "Hello World (V. Raptor, 1986) bye",
    "Narcissism is not dead (Rinker, 2014)",
    "The R Core Team (2014) has many members.",
    paste("Bunn (2005) said, \"As for elegance, R is refined, tasteful, and",
        "beautiful. When I grow up, I want to marry R.\""),
    "It is wrong to blame ANY tool for our own shortcomings (Baer, 2005).",
    "Wickham's (in press) Tidy Data should be out soon.",
    "Rinker's (n.d.) dissertation not so much.",
    "I always consult xkcd comics for guidance (Foo, 2012; Bar, 2014).",
    "Uwe Ligges (2007) says, \"RAM is cheap and thinking hurts\"",
    " Mr. Bean bought 2 tickets 2-613-213-4567 or 5555555555 call either one",
    "43 Butter Rd, Brossard QC K0A 3P0 - 613 213 4567", 
    "Please contact Mr. Bean (613)2134567",
    "1.575.555.5555 is his #1 number",  
    "7164347566",
    "I like 1234567 dogs",
    "download file from http://example.com", 
    "this is the link to my website http://example.com", 
    "go to http://example.com from more info.",
    "Another url ftp://www.example.com",
    "And https://www.example.net",
    "twitter type: t.co/N1kq0F26tG",
    "still another one https://t.co/N1kq0F26tG :-)",
    "I'm getting 3:04 AM just fine, but...",
    "for 10:47 AM I'm getting 0:47 AM instead.",
    "no time here",
    "Some time has 12:04 with no AM/PM after it",
    "Some time has 12:04 a.m. or the form 1:22 pm",
    "download file from http://example.com", 
    "this is the link to my website http://example.com", 
    "go to http://example.com from more info.",
    "Another url ftp://www.example.com",
    "And https://www.example.net",
    "twitter type: t.co/N1kq0F26tG",
    "still another one https://t.co/N1kq0F26tG :-)",
    "are :-)) it 8-D he XD on =-D they :D of :-) is :> for :o) that :-/",
    "as :-D I xD with :^) a =D to =) the 8D and :3 in =3 you 8) his B^D was"
)


matches <- list(
    phone = grab('@rm_phone'),
    hash = grab('@rm_hash'),
    tag = grab('@rm_tag'),
    url = grab('@rm_url'),
    twitter_url = grab('@rm_twitter_url'),
    email = grab('@rm_email'),
    title = grab('@rm_title_name'),
    citation = grab('@rm_citation'),
    abbreviation = grab('@rm_abbreviation'),
    time = grab('@rm_time'),
    emoticon = grab('@rm_emoticon'),
    state = pastex(state.abb)
)

set.seed(10)
txt <- sample(x, 1000, TRUE)
(tcnt <- term_count(txt, TRUE, matches, ignore.case = FALSE))

as_dtm(tcnt)
textshape::tidy_dtm(as_dtm(tcnt))

## End(Not run)

trinker/termco documentation built on Jan. 7, 2022, 3:32 a.m.