&twocol

# set global chunk options
library(reports); library(slidify); library(knitcitations); library(knitr); library(qdap)
opts_chunk$set(cache=FALSE, tidy = FALSE)


#Reading in your own bib file:
bib <- read.bibtex(dir()[tools::file_ext(dir()) == "bib"][1])
#cite in text using `r citet(bib[1])`
internal <- TRUE

About Me

*** =right

r VS(2) - Former First Grade Teacher - Literacy PhD Student - Quantitatively Bent - Accidental Programmer

*** =left

r IM2("norah.png", width=350)

*** =pnotes

Research Interests

--- &twocol

Why R?

*** =left

r VS(2) - Cutting Edge
- Powerful - Visualization

r VS(4) r HS(4) Everything can be quantified...

*** =right

r VS(2) r IM2("r.jpg", width=350)

--- {class: class, tpl: tabs}

Why qdap?

*** {class: active, id: qdap}

r IM2("whyqdap.png")

*** {id: Birth}

Frustration

r IM2("fustration.jpg", width = 400)

*** {id: Affordances}

Affordances


Recommendations


  • r IM2("logo_rstudio.jpg", width = 200, center = FALSE, link = "http://www.rstudio.com/")
  • r IM2("github-social.png", width = 200, center = FALSE, link ="https://github.com/")
  • r IM2("knitr.png", width = 200, center = FALSE, link ="http://yihui.name/knitr/") + r IM2("latex.png", width = 160, center = FALSE, link = "http://en.wikipedia.org/wiki/LaTeX") r HS(5) r HR("C:/Users/trinker/Desktop/proximity/REPORT/proximity_measure.Rnw", ".Rmd") r HS(5) r HR("C:/Users/trinker/Desktop/proximity/REPORT/proximity_measure.pdf", ".pdf")
  • r IM2("ggplot.png", width = 200, center = FALSE, link = "http://docs.ggplot2.org/current/")


    press p

*** =pnotes

r IM2("ggplot2demo.png")


Let's Dig In


r IM2("dig.jpg")

*** =pnotes

Agenda

  1. Installing qdap
  2. Projects
  3. Read In Data
  4. Word Counts and Descriptive Statistics
  5. Word Measures and Scoring
  6. Qualitative Coding System
  7. Visualizing Discourse Data
  8. Discussion

Installing qdap

# install.packages("devtools")
library(devtools)
install_github(c("slidify", "slidifyLibraries"), "ramnathv", ref = "dev")
install_github("knitcitations", "cboettig")
install_github(c("reports", "qdapDictionaries", "qdap"), "trinker")
install_github("ggthemes", "jrnold")
install.packages("scales")

# Attach every package the later slides rely on. `library()` is used rather
# than `require()` so that a missing package stops the run with an error here
# instead of returning FALSE and failing later at first use.
invisible(lapply(c("qdap", "ggplot2", "ggthemes", "scales", "grid"),
    library, character.only = TRUE))

r VS(2)

https://github.com/trinker/qdap

--- .YT yt:chQlpEj8g2Q &youtube

Projects

*** =pnotes

r IF("vid1.html")


General qdap Function Format


Function(Text_Variable, list(Grouping_Variables))

r VS(2)

with(Data_Set, Function(Text_Variable, list(Grouping_Variables)))

Read In Data


Read In Data

r IM2("transcript.png", width= 850)


Read In Data

doc1 <- system.file("extdata/transcripts/trans1.docx", package = "qdap")
dat1 <- read.transcript(doc1)
truncdf(dat1, 50)

r VS(2) Plenty of parsing tools to clean up!!!


Our Data Set

DATA
suppressMessages(library(xtable))
print(xtable(DATA), type="html")

Word Counts and Descriptive Statistics

  1. Word Frequency Matrix
  2. Word Stats
  3. Term Counts
  4. Question Types
  5. Parts of Speech
  6. Syllabication

Word Frequency Matrix

with(DATA, wfm(state, person))[1:14, ]

Word Frequency Matrix

plot(with(DATA, wfm(state, person)), values = TRUE, plot = FALSE) +
   coord_flip()

Word Frequency Matrix (Correlations)

dat2 <- wfm(DATA$state, seq_len(nrow(DATA)))
qheat(cor(t(dat2)), low = "yellow", high = "red",
    grid = "grey90", diag.na = TRUE, by.column = NULL)

Word Stats (1 of 3)

(desc_wrds <- with(mraja1spl, word_stats(dialogue, person, tot = tot)))
desc_wrds2 <- with(mraja1spl, word_stats(desc_wrds, person, tot = tot, digits = 1))
desc_wrds2$gts[, c(1, 2:9)]

Word Stats (2 of 3)

desc_wrds2$gts[, c(1, 10:19)]

Word Stats (3 of 3)

desc_wrds2$gts[, c(1, 20:26)]

Word Stats Plot

plot(desc_wrds, label = TRUE, high="red")

Term Counts

ml2 <- list(
    theme_1 = c(" the ", " a ", " an "),
    theme_2 = c(" I'" ),
    "good",
    the_words = c("the", " the ", " the", "the ")
)
out <- with(raj.act.1,  termco(dialogue, person, ml2))

*** =pnotes

r VS(2) *Press p

ml2 <- list(
    theme_1 = c(" the ", " a ", " an "),
    theme_2 = c(" I'" ),
    "good",
    the_words = c("the", " the ", " the", "the ")
)

Term Counts

out

*** =pnotes

ml2 <- list(
    theme_1 = c(" the ", " a ", " an "),
    theme_2 = c(" I'" ),
    "good",
    the_words = c("the", " the ", " the", "the ")
)

Term Counts Plot

plot(out, high = "red", low = "yellow", label = TRUE)

Question Types

(x <- with(mraja1spl, question_type(dialogue, person)))

Question Types Plot

plot(x)

Parts of Speech

(posbydat <- with(DATA, pos_by(state, list(adult, sex))))
load("pos.RData")
posbydat

r VS(3) r HR("http://faculty.washington.edu/dillon/GramResources/penntable.html", "Penn Treebank Project (1991)")

*** =pnotes

posbydat[["POStagged"]]

Parts of Speech Plot

plot(posbydat, label = TRUE)

Word Measures and Scoring

  1. Readability
  2. Formality
  3. Polarity

Readability

  1. Automated Readability Index
  2. Coleman Liau
  3. SMOG
  4. Flesch Kincaid
  5. Fry
  6. Linsear Write

Readability

with(rajSPLIT, coleman_liau(dialogue, list(fam.aff)))

Formality

Heylighen & Dewaele (1999a, 1999b, 2002)


$$ F = 50(\frac{n_{f}-n_{c}}{N} + 1) $$

Where:

$$ f = \left\{noun, \;adjective, \;preposition, \;article\right\} $$ $$ c = \left\{pronoun, \;verb, \;adverb, \;interjection\right\} $$ $$ N = \sum{(f \;+ \;c \;+ \;conjunctions)} $$


Formality

(form <- with(raj, formality(dialogue, act)))
(form <- with(raj, formality(rajPOS, act)))

Formality Plot

plot(form, bar.colors=c("Set2", "RdBu"))

Polarity

(poldat <- with(mraja1spl, polarity(dialogue, list(sex, fam.aff, died))))

plot(poldat)

Qualitative Coding System

Apply codes to:


codes <- qcv(AA, BB, CC)
X <- cm_df.transcript(DATA$state, DATA$person, file="DATA.txt")
sam:

     1        2  3    4   5   6   
     Computer is fun. Not too fun.
greg:

     7  8    9    10   11   
     No it's not, it's dumb.
teacher:

     12   13     14 15 
     What should we do?
sam:

     16  17    18 19     
     You liar, it stinks!

r IM2("transcript2.png", width=800) r VS(1) Coding time spans works similarly...

press p

*** =pnotes

x <- list(
    transcript_time_span = qcv(00:00 - 1:12:00),
    A = qcv(terms = "2.40:3.00, 5.01, 6.02:7.00, 9.00, 
        12.00:14.00, 00.51.00:00.59.00"),
    B = qcv(terms = "2.40, 3.01:3.02, 5.01, 6.02:7.00,
        9.00, 30.00:39.00, 1.12.00:1.19.01"),
    C = qcv(terms = "2.40:3.00, 5.01, 6.02:7.00, 9.00, 17.01")
)

After Reading Qualitative Codes...

(y <- cm_2long(x))

Gantt Plot of Codes

plot(y)

Summary of Codes

summary(y)
plot(summary(y))

Visualizing Discourse Data

  1. Lexical Dispersion Plot
  2. Word Cloud
  3. Turn of Talk Plot
  4. Venn Diagram
  5. Word Network Plot

Lexical Dispersion Plot

with(rajSPLIT , dispersion_plot(dialogue, c("love", "night"),
    grouping.var = list(fam.aff, sex), rm.vars = act))

Lexical Dispersion Plot

wrds <- word_list(pres_debates2012$dialogue, stopwords = Top200Words)
wrds2 <- spaste(wrds[["rfswl"]][["all"]][, "WORD"])
wrds2 <- c(" governor~~romney ", wrds2[-c(3, 12)])
with(pres_debates2012 , dispersion_plot(dialogue, wrds2, rm.vars = time, color="black", bg.color="white"))

Word Cloud (Colored Terms)

terms <- list(
    I = c("i", "i'm"),
    mal = qcv(stinks, dumb, distrust),
    articles = qcv(the, a, an),
    pronoun = qcv(we, you)
)

with(DATA, trans_cloud(state, target.words=terms,
    cloud.colors=qcv(red, green, blue, black, gray65),
    expand.target=FALSE, proportional=TRUE, legend=c(names(terms),
    "other")))
 
terms <- list(
    I = c("i", "i'm"),
    mal = qcv(stinks, dumb, distrust),
    articles = qcv(the, a, an),
    pronoun = qcv(we, you)
)

with(DATA, trans_cloud(state, target.words=terms,
    cloud.colors=qcv(red, green, blue, black, gray65),
    expand.target=FALSE, proportional=TRUE, legend=c(names(terms),
    "other")))

Word Cloud Plot (Colored Terms)

with(DATA, trans_cloud(state, target.words=terms, 
    cloud.colors=qcv(red, green, blue, black, gray65),
    expand.target=FALSE, proportional=TRUE, legend=c(names(terms),
    "other"), max.word.size = 8, min.word.size = .5))

Gradient Cloud

DATA2 <- DATA
DATA2$state <- space_fill(DATA$state, c("is fun", "too fun", "you liar"))
gradient_cloud(DATA$state, DATA$sex, title="Houghton Colors", 
    max.word.size = 8, min.word.size = .01, X ="purple" , Y = "yellow")

Gradient Cloud

gradient_cloud(DATA2$state, DATA2$sex, title="Gender Word Use", 
    max.word.size = 8, min.word.size = .01, X ="red" , Y = "blue")

Turn of Talk Plot

r VS(1)

# Turn-of-talk plot for `mraja1`, grouped by sex and family affiliation.
# `plot=FALSE` is passed and further ggplot2 layers are added with `+`
# (presumably the call returns a ggplot object without drawing it -- confirm
# against the qdap docs).
tot_plot(mraja1, "dialogue", grouping.var = c("sex", "fam.aff"), tot=FALSE, plot=FALSE)+
    scale_fill_brewer(palette="Set1") +
    # Horizontal reference lines at the mean word count, mean + 2 sd and
    # mean + 3 sd.
    geom_hline(aes(yintercept=mean(word.count))) +
    geom_hline(aes(yintercept=mean(word.count) + (2 *sd(word.count)))) +
    geom_hline(aes(yintercept=mean(word.count) + (3 *sd(word.count)))) +
    # Text labels placed 2 units above each reference line; the first label is
    # parsed as an expression (parse=TRUE), the other two are literal text.
    geom_text(parse=TRUE, hjust=0, vjust=0, size = 3, aes(x = 2, 
        y = mean(word.count) + 2, label = "bar(x)")) +
    geom_text(hjust=0, vjust=0, size = 3, aes(x = 1, 
        y = mean(word.count) + (2 *sd(word.count)) + 2, label = "+2 sd")) +
    geom_text(hjust=0, vjust=0,  size = 3, aes(x = 1, 
        y = mean(word.count) + (3 *sd(word.count)) + 2, label = "+3 sd")) +
    ggtitle("Romeo & Juliet:\nAct 1 Turns of Talk")

Venn Diagram

with(DATA , trans_venn(state, person, legend.location = "topright"))

Word Network Plot

word_network_plot(text.var=DATA$state, DATA$person, stopwords=NULL)

Discussion

  1. How might qdap + R fit into your workflow?
  2. What do you want to know more about?
  3. Are there any points that need to be clarified?


trinker/qdap documentation built on Sept. 30, 2020, 6:28 p.m.