if (!require("pacman")) install.packages("pacman") pacman::p_load_gh(c( "trinker/termco", "trinker/tagger", "trinker/formality", "trinker/termco", "trinker/syllable", "trinker/readability", "trinker/textshape", "trinker/sentimentr", "trinker/lexr" )) pacman::p_load(ggplot2, dplyr, knitr, pander, stringi, scales) opts_chunk$set(warning = FALSE, echo=FALSE)
dat <- readRDS("report_data.rds") grp <-attributes(dat)[["group"]] grpcmb <- attributes(dat)[["group.combined"]] txt <- attributes(dat)[["text"]] ```` # Frequent Terms ```r minchar <- 7 nrows <- 25 hfreqs <- frequent_terms(dat[[txt]], nrows, min.char=minchar) hfreqs %>% plot(plot=FALSE) + ggtitle(sprintf("~%s Highest Frequency Terms (>= %s characters)", nrows, minchar))
terms <- hfreqs %>% {.[[1]]} suppressWarnings(term_count(dat[[txt]], dat[grpcmb], terms)) %>% plot(plot=FALSE) + ggtitle(sprintf("~%s Highest Frequency Terms by\n Group (>= %s characters)", nrows, minchar))
nterms <- 10 lex <- lexical_dispersion_plot(dat[[txt]], terms[1:nterms], dat[grpcmb], plot=FALSE) lex + ggtitle(sprintf("%s Highest Frequency Terms by\n Group (>= %s characters)", nterms, minchar))
len_ngram <- 20 ngrams_dat <- ngram_collocations(dat[[txt]], len_ngram) out <- ngrams_dat %>% plot(plot=FALSE) out[[1]] + ggtitle(sprintf("~%s Important Bi-Gram Collocations", len_ngram)) out[[2]] + ggtitle(sprintf("~%s Important Bi-Gram Collocations Measures", len_ngram))
terms2 <- ngrams_dat %>% select(1:2) %>% rowwise() %>% mutate(terms = paste(term1, term2)) %>% select(3) %>% unlist(use.names=FALSE) suppressWarnings(term_count(dat[[txt]], dat[grpcmb], terms2)) %>% plot(plot=FALSE) + ggtitle(sprintf("~%s Important Bi-Gram Collocations by Group", len_ngram))
nterms2 <- 10 lex2 <- lexical_dispersion_plot(dat[[txt]], terms2[1:nterms2], dat[grpcmb], plot=FALSE) lex2 + ggtitle(sprintf("%s Important Bi-Gram\nCollocations Dispersion by Group", nterms, minchar))
dat[["counts"]] <- stringi::stri_count_words(dat[[txt]]) dat[["counts"]][is.na(dat[["counts"]])] <- 0 panderOptions('table.alignment.default', function(df) ifelse(sapply(df, is.numeric), 'right', 'left')) readability_word_stats_by(dat[[txt]], dat[grpcmb], group.names = grpcmb) %>% pander()
dat[["counts"]] <- stringi::stri_count_words(dat[[txt]]) dat[["counts"]][is.na(dat[["counts"]])] <- 0 dat[["tot"]] <- seq_len(nrow(dat)) dat[["group"]] <- dat[[grp]] MEAN <- mean(dat[["counts"]]) SD <- sd(dat[["counts"]]) MAX <- max(dat[["counts"]]) OUT <- MEAN+3*SD above <- 2*MAX/100 ggplot(dat, aes(x = tot, weight = counts, fill=group)) + geom_hline(yintercept=MEAN, alpha=.4, color="blue") + geom_hline(yintercept=MEAN+3*SD, alpha=.4, color="red") + geom_bar(binwidth=1) + annotate("text", x = 1, y = MEAN + above, hjust = 0, size=3, label=paste0("bar(x) ==", round(MEAN, 0)), parse=T) + annotate("text", x = 1, y = OUT + above, hjust = 0, size=3, label=paste0("3 sd =", round(OUT, 0))) + ylab("Count") + xlab("Turn of Talk") + theme_minimal() + theme( panel.grid = element_blank(), legend.position="bottom" ) + scale_y_continuous(expand = c(0, 0), limits = c(0, MAX + MAX*.04)) + scale_x_continuous(expand = c(0, 0), limits = c(0, nrow(dat))) + guides(fill=guide_legend( title=grp, nrow = ceiling(length(unique(dat[[grp]]))/6), byrow=TRUE ))
sent <- sentiment_by(dat[[txt]], dat[grpcmb], group.names = grpcmb) plot(uncombine(sent)) + ggtitle("Smoothed Sentiment Over the Length of the Text")
plot(sent, plot=FALSE) + ggtitle("Sentiment Distribution by Group")
readability_out <- readability(dat[[txt]], dat[grpcmb], group.names = grpcmb) plot(readability_out) + ggtitle("Readability Distribution by Group")
formality_out <- formality(dat[[txt]], dat[grpcmb], group.names = grpcmb) plots <- plot(formality_out, plot=FALSE) plots[[1]] + ggtitle("Formality Score by Group") plots[[2]] + ggtitle("Percent Contextual vs. Formal Words by Group") plots[[3]] + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle("Contextual vs. Formal Part of Speech by Group")
dat[["endmark"]] <- unlist(stringi::stri_extract_all_regex(dat[[txt]], "((\\.{3})|([?.;:!|]))$")) key <- setNames(c('...', '?', '.', ';', ':', '!', '|'), c("Incomplete", 'Question', 'Statement', 'Joining', 'Joining', 'Exclamation', 'Incomplete')) dat[["type"]] <- names(key)[match(dat[["endmark"]], key)] plot_counts(dat[["type"]],item.name = "Sentence Type") + ggtitle("Percentage of Sentence Types")
ord <- names(sort(table(dat[["type"]]), TRUE)) dat %>% group_by_('group', "type") %>% filter(!is.na(type)) %>% summarize(n = n()) %>% group_by_('group') %>% mutate( prop = n/sum(n), type = factor(type, levels=ord) ) %>% ggplot(aes(x=type, y=group, fill = prop)) + geom_tile() + scale_fill_gradient(name="", high = "darkred", low = "white", label = scales::percent) + theme_bw() + theme( panel.grid = element_blank(), legend.key.width = grid::unit(.25,"cm"), legend.key.height = grid::unit(1.5,"cm") ) + ylab(NULL) + xlab("Sentence Type") + ggtitle("Percentage Usage of Sentence Type by Group")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.