```r
if (!require("pacman")) install.packages("pacman")
pacman::p_load_gh(c(
    "trinker/termco", 
    "trinker/tagger", 
    "trinker/formality",
    "trinker/termco",
    "trinker/syllable", 
    "trinker/readability",
    "trinker/textshape",    
    "trinker/sentimentr",
    "trinker/lexr"
))
pacman::p_load(ggplot2, dplyr, knitr, pander, stringi, scales)
opts_chunk$set(warning = FALSE, echo = FALSE)
dat <- readRDS("report_data.rds")
grp <- attributes(dat)[["group"]]
grpcmb <- attributes(dat)[["group.combined"]]
txt <- attributes(dat)[["text"]]
```
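
The setup chunk expects `report_data.rds` to hold a data frame whose attributes name the text and grouping columns (`"text"`, `"group"`, `"group.combined"`). A minimal sketch of such an object, with hypothetical column names, might look like:

```r
## Hypothetical construction of report_data.rds; the real object is produced
## upstream by textreport, so treat the column names here as placeholders.
dat <- data.frame(
    person   = c("sam", "greg", "sam"),
    dialogue = c("How are you?", "Fine.  How about you?", "Good!"),
    stringsAsFactors = FALSE
)
attr(dat, "text") <- "dialogue"
attr(dat, "group") <- "person"
attr(dat, "group.combined") <- "person"
saveRDS(dat, "report_data.rds")
```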


# Frequent Terms

```r
minchar <- 7
nrows <- 25
hfreqs <- frequent_terms(dat[[txt]], nrows, min.char=minchar) 

hfreqs %>%
    plot(plot=FALSE) +
    ggtitle(sprintf("~%s Highest Frequency Terms (>= %s characters)", nrows, minchar))
## Pull the term column (a character vector) for the grouped term counts below
terms <- hfreqs[[1]]

suppressWarnings(term_count(dat[[txt]], dat[grpcmb], terms)) %>%
    plot(plot=FALSE) +
    ggtitle(sprintf("~%s Highest Frequency Terms by\n Group (>= %s characters)", nrows, minchar))
nterms <- 10
lex <- lexical_dispersion_plot(dat[[txt]], terms[1:nterms], dat[grpcmb], plot=FALSE)

lex + 
    ggtitle(sprintf("%s Highest Frequency Terms by\n Group (>= %s characters)", nterms, minchar))

```

# Important Bigrams

```r
len_ngram <- 20
ngrams_dat <- ngram_collocations(dat[[txt]], len_ngram)
out <- ngrams_dat %>%
    plot(plot=FALSE)

out[[1]] + ggtitle(sprintf("~%s Important Bi-Gram Collocations", len_ngram))
out[[2]] + ggtitle(sprintf("~%s Important Bi-Gram Collocations Measures", len_ngram))
## Combine the two collocation columns into single bigram search terms
terms2 <- paste(ngrams_dat[["term1"]], ngrams_dat[["term2"]])

suppressWarnings(term_count(dat[[txt]], dat[grpcmb], terms2)) %>%
    plot(plot=FALSE) +
    ggtitle(sprintf("~%s Important Bi-Gram Collocations by Group", len_ngram))
nterms2 <- 10
lex2 <- lexical_dispersion_plot(dat[[txt]], terms2[1:nterms2], dat[grpcmb], plot=FALSE)

lex2 + 
    ggtitle(sprintf("%s Important Bi-Gram\nCollocations Dispersion by Group", nterms2))

```

# Descriptives

```r
dat[["counts"]] <- stringi::stri_count_words(dat[[txt]])
dat[["counts"]][is.na(dat[["counts"]])] <- 0

panderOptions('table.alignment.default',
     function(df) ifelse(sapply(df, is.numeric), 'right', 'left'))

readability_word_stats_by(dat[[txt]],  dat[grpcmb], group.names = grpcmb) %>%
    pander()

```

# Turn of Talk

```r
dat[["counts"]] <- stringi::stri_count_words(dat[[txt]])
dat[["counts"]][is.na(dat[["counts"]])] <- 0
dat[["tot"]] <- seq_len(nrow(dat))
dat[["group"]] <- dat[[grp]]
MEAN <- mean(dat[["counts"]])
SD <- sd(dat[["counts"]])
MAX <- max(dat[["counts"]])
OUT <- MEAN + 3 * SD     ## outlier reference: mean plus three standard deviations
above <- 2 * MAX / 100   ## vertical offset (2% of the tallest bar) for annotation labels

ggplot(dat, aes(x = tot, weight = counts, fill = group)) +
    geom_hline(yintercept = MEAN, alpha = .4, color = "blue") +
    geom_hline(yintercept = OUT, alpha = .4, color = "red") +    
    geom_bar(width = 1) +
    annotate("text", x = 1, y = MEAN + above, hjust = 0, size = 3,
        label = paste0("bar(x) ==", round(MEAN, 0)), parse = TRUE) +
    annotate("text", x = 1, y = OUT + above, hjust = 0, size = 3,
        label = paste0("3 sd = ", round(OUT, 0))) +    
    ylab("Count") +
    xlab("Turn of Talk") + 
    theme_minimal() +
    theme(
        panel.grid = element_blank(),
        legend.position="bottom"
    ) + 
    scale_y_continuous(expand = c(0, 0), limits = c(0, MAX + MAX*.04))  +
    scale_x_continuous(expand = c(0, 0), limits = c(0, nrow(dat)))  +    
    guides(fill=guide_legend(
            title=grp,
            nrow = ceiling(length(unique(dat[[grp]]))/6), byrow=TRUE
    )) 
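
## Aside: the red reference line above is a mean + 3 * SD rule-of-thumb cut-off
## for unusually long turns of talk.  A standalone illustration with
## hypothetical counts (not the report data):
##   toy <- c(rep(5, 30), 60)
##   toy[toy > mean(toy) + 3 * sd(toy)]   # only the 60-word turn exceeds the cut-off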

```

# Sentiment

```r
sent <- sentiment_by(dat[[txt]], dat[grpcmb], group.names = grpcmb)

plot(uncombine(sent)) +
    ggtitle("Smoothed Sentiment Over the Length of the Text")
plot(sent, plot=FALSE) +
    ggtitle("Sentiment Distribution by Group")

```

# Readability

```r
readability_out <- readability(dat[[txt]], dat[grpcmb], group.names = grpcmb)

plot(readability_out) +
    ggtitle("Readability Distribution by Group")

```

# Formality

```r
formality_out <- formality(dat[[txt]], dat[grpcmb], group.names = grpcmb)

plots <- plot(formality_out, plot=FALSE) 
plots[[1]] + 
    ggtitle("Formality Score by Group")
plots[[2]] + 
    ggtitle("Percent Contextual vs. Formal Words by Group")
plots[[3]] + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    ggtitle("Contextual vs. Formal Part of Speech by Group")

```

# Sentence Type

```r
dat[["endmark"]] <- unlist(stringi::stri_extract_all_regex(dat[[txt]], "((\\.{3})|([?.;:!|]))$"))
key <- setNames(c('...', '?', '.', ';', ':', '!', '|'), 
    c("Incomplete", 'Question', 'Statement', 'Joining', 'Joining', 'Exclamation', 'Incomplete'))
dat[["type"]] <- names(key)[match(dat[["endmark"]], key)]
plot_counts(dat[["type"]], item.name = "Sentence Type") + 
    ggtitle("Percentage of Sentence Types")
ord <- names(sort(table(dat[["type"]]), TRUE))

dat %>%
    group_by(group, type) %>%
    filter(!is.na(type)) %>%
    summarize(n = n()) %>%
    group_by(group) %>%
    mutate(
        prop = n/sum(n),
        type = factor(type, levels=ord)
    ) %>%
    ggplot(aes(x=type, y=group, fill = prop)) +
        geom_tile() +
        scale_fill_gradient(name = "", high = "darkred", low = "white", labels = scales::percent) +
        theme_bw() +
        theme(
            panel.grid = element_blank(),
            legend.key.width = grid::unit(.25,"cm"),
            legend.key.height = grid::unit(1.5,"cm")
        ) +
        ylab(NULL) +
        xlab("Sentence Type") +
        ggtitle("Percentage Usage of Sentence Type by Group")

