About Me

- Former First Grade Teacher
- Literacy PhD Student
- Quantitatively Bent
- Accidental Programmer



Research Interests

Why R?

- Cutting Edge
- Powerful
- Visualization

Everything can be quantified...



Why qdap?

r IM2("whyqdap.png")

r IM2("fustration.jpg", width = 400)

  RStudio
  GitHub
  knitr + LaTeX
  ggplot2



Let's Dig In

r IM2("dig.jpg")

  1. Installing qdap
  2. Projects
  3. Read In Data
  4. Word Counts and Descriptive Statistics
  5. Word Measures and Scoring
  6. Qualitative Coding System
  7. Visualizing Discourse Data
  8. Discussion

Installing qdap

# One-time setup. Install devtools from CRAN first if it is not available:
# install.packages("devtools")
# install_github() takes "user/repo[@ref]" strings; the separate `username`
# argument used by older devtools releases is no longer accepted.
devtools::install_github(
    c("ramnathv/slidify@dev", "ramnathv/slidifyLibraries@dev")
)
devtools::install_github("cboettig/knitcitations")
devtools::install_github(
    c("trinker/reports", "trinker/qdapDictionaries", "trinker/qdap")
)
devtools::install_github("jrnold/ggthemes")

# Attach every package the slides use. library() stops with an error when a
# package is missing, whereas require() only returns FALSE (which invisible()
# would hide), deferring the failure to the first call into that package.
invisible(lapply(c("qdap", "ggplot2", "ggthemes", "scales", "grid"),
    library, character.only = TRUE))

General qdap Function Format

Function(Text_Variable, list(Grouping_Variables))



with(Data_Set, Function(Text_Variable, list(Grouping_Variables)))

Read In Data

r IM2("transcript.png", width= 850)

Read In Data

# Path to the example .docx transcript found inside the installed qdap package
doc1 <- system.file("extdata/transcripts/trans1.docx", package = "qdap")
# Read the transcript into dat1
dat1 <- read.transcript(doc1)
# Display dat1 via truncdf(); NOTE(review): 50 is presumably the maximum
# displayed width of each text cell -- confirm against the qdap docs
truncdf(dat1, 50)

Plenty of parsing tools to clean up!!!

Our Data Set

# Convert DATA with xtable() and print it as HTML
print(xtable(DATA), type="html")

Word Counts and Descriptive Statistics

  1. Word Frequency Matrix
  2. Word Stats
  3. Term Counts
  4. Question Types
  5. Parts of Speech
  6. Syllabication

Word Frequency Matrix

# Word frequency matrix of `state` by `person`; show rows 1 through 14
with(DATA, wfm(state, person))[1:14, ]

Word Frequency Matrix

# Plot the word frequency matrix of `state` by `person` with values = TRUE.
# The expression must not end in a bare `+`: a trailing `+` with no right-hand
# operand is a parse error.
# NOTE(review): plot = FALSE presumably returns the plot object without
# drawing it so that a ggplot2 layer can be appended; if a layer was intended
# here (e.g. `+ coord_flip()`), restore it.
plot(with(DATA, wfm(state, person)), values = TRUE, plot = FALSE)

Word Frequency Matrix (Correlations)

# Word frequency matrix of `state`, grouped by row index of DATA
dat2 <- wfm(DATA$state, seq_len(nrow(DATA)))
# qheat() of the correlation matrix of the transposed wfm
qheat(
    cor(t(dat2)),
    low = "yellow", high = "red", grid = "grey90",
    diag.na = TRUE, by.column = NULL
)

Word Stats (1 of 3)

# Word statistics for `dialogue` by `person`; the outer parentheses print the
# assigned result
(desc_wrds <- with(mraja1spl, word_stats(dialogue, person, tot = tot)))
# Second call takes the previous word_stats result as its first argument and
# sets digits = 1
desc_wrds2 <- with(
    mraja1spl,
    word_stats(desc_wrds, person, tot = tot, digits = 1)
)
# Columns 1 through 9 of the `gts` element
desc_wrds2$gts[, 1:9]

Word Stats (2 of 3)

# Column 1 plus columns 10 through 19 of the `gts` element
desc_wrds2$gts[, c(1, 10:19)]

Word Stats (3 of 3)

# Column 1 plus columns 20 through 26 of the `gts` element
desc_wrds2$gts[, c(1, 20:26)]

Word Stats Plot

# Plot the word_stats result with label = TRUE and "red" as the high color
plot(desc_wrds, label = TRUE, high="red")

Term Counts

# Named list of search terms, one element per theme. The list() call must be
# closed before the next statement.
ml2 <- list(
    theme_1 = c(" the ", " a ", " an "),
    theme_2 = c(" I'"),
    the_words = c("the", " the ", " the", "the ")
)
# Term counts for `dialogue` by `person` using the themes in ml2
out <- with(raj.act.1,  termco(dialogue, person, ml2))

# Named list of search terms, one element per theme (closing parenthesis
# is required for the expression to parse)
ml2 <- list(
    theme_1 = c(" the ", " a ", " an "),
    theme_2 = c(" I'"),
    the_words = c("the", " the ", " the", "the ")
)

Term Counts


# Named list of search terms, one element per theme (closing parenthesis
# is required for the expression to parse)
ml2 <- list(
    theme_1 = c(" the ", " a ", " an "),
    theme_2 = c(" I'"),
    the_words = c("the", " the ", " the", "the ")
)

Term Counts Plot

# Plot the termco result `out`, colored from "yellow" (low) to "red" (high),
# with label = TRUE
plot(out, high = "red", low = "yellow", label = TRUE)

Question Types

# Question types in `dialogue` by `person`; the outer parentheses print the
# assigned result
(x <- with(mraja1spl, question_type(dialogue, person)))

Question Types Plot


Parts of Speech

# Parts of speech in `state`, grouped by `adult` and `sex`; the outer
# parentheses print the assigned result
(posbydat <- with(DATA, pos_by(state, list(adult, sex))))

Penn Treebank Project (1991)

Parts of Speech Plot

# Plot the pos_by result with label = TRUE
plot(posbydat, label = TRUE)

Word Measures and Scoring

  1. Readability
  2. Formality
  3. Polarity


  1. Automated Readability Index
  2. Coleman Liau
  3. SMOG
  4. Flesch Kincaid
  5. Fry
  6. Linsear Write


# Coleman Liau score for `dialogue`, grouped by `fam.aff`
with(rajSPLIT, coleman_liau(dialogue, list(fam.aff)))


Heylighen & Dewaele (1999a, 1999b, 2002)

$$ F = 50(\frac{n_{f}-n_{c}}{N} + 1) $$


$$ f = \left\{noun, \;adjective, \;preposition, \;article\right\} $$ $$ c = \left\{pronoun, \;verb, \;adverb, \;interjection\right\} $$ $$ N = \sum{(f \;+ \;c \;+ \;conjunctions)} $$


# Formality of `dialogue` by `act`; the outer parentheses print the result
(form <- with(raj, formality(dialogue, act)))
# Same call with rajPOS in place of the raw text; this overwrites `form`.
# NOTE(review): rajPOS is presumably a precomputed parts-of-speech object for
# raj, used to avoid re-tagging -- confirm
(form <- with(raj, formality(rajPOS, act)))

Formality Plot

# Plot the formality result; bar.colors is given the two names "Set2" and "RdBu"
plot(form, bar.colors=c("Set2", "RdBu"))


# Polarity of `dialogue`, grouped by `sex`, `fam.aff` and `died`; the outer
# parentheses print the assigned result
(poldat <- with(mraja1spl, polarity(dialogue, list(sex, fam.aff, died))))


Qualitative Coding System

Apply codes to:

# Code names built with qcv() from the bare words AA, BB, CC
codes <- qcv(AA, BB, CC)
# Build a coding transcript from `state` and `person`, with file="DATA.txt".
# NOTE(review): presumably writes the numbered transcript to DATA.txt -- confirm
X <- cm_df.transcript(DATA$state, DATA$person, file="DATA.txt")

     1        2  3    4   5   6   
     Computer is fun. Not too fun.

     7  8    9    10   11   
     No it's not, it's dumb.

     12   13     14 15 
     What should we do?

     16  17    18 19     
     You liar, it stinks!

Coding time spans works similarly...

# Time-span codes: the overall transcript span plus one element per code
# (A, B, C) listing the spans assigned to it. The list() call must be closed
# for the expression to parse.
x <- list(
    transcript_time_span = qcv(00:00 - 1:12:00),
    A = qcv(terms = "2.40:3.00, 5.01, 6.02:7.00, 9.00, 
        12.00:14.00, 00.51.00:00.59.00"),
    B = qcv(terms = "2.40, 3.01:3.02, 5.01, 6.02:7.00,
        9.00, 30.00:39.00, 1.12.00:1.19.01"),
    C = qcv(terms = "2.40:3.00, 5.01, 6.02:7.00, 9.00, 17.01")
)

After Reading Qualitative Codes...

# Pass the code list x through cm_2long(); the outer parentheses print the
# assigned result
(y <- cm_2long(x))

Gantt Plot of Codes


Summary of Codes


Visualizing Discourse Data

  1. Lexical Dispersion Plot
  2. Word Cloud
  3. Turn of Talk Plot
  4. Venn Diagram
  5. Word Network Plot

Lexical Dispersion Plot

# Dispersion plot of "love" and "night" in `dialogue`, grouped by `fam.aff`
# and `sex`, with `act` passed as rm.vars
with(
    rajSPLIT,
    dispersion_plot(
        dialogue, c("love", "night"),
        grouping.var = list(fam.aff, sex),
        rm.vars = act
    )
)

Lexical Dispersion Plot

# Word list for the debate dialogue, with Top200Words as stopwords
wrds <- word_list(pres_debates2012$dialogue, stopwords = Top200Words)
# Take the WORD column of the "all" table under "rfswl" and pass it to spaste()
wrds2 <- spaste(wrds[["rfswl"]][["all"]][, "WORD"])
# Drop entries 3 and 12 and prepend the phrase " governor~~romney "
wrds2 <- c(" governor~~romney ", wrds2[-c(3, 12)])
# Dispersion plot of those terms, with `time` passed as rm.vars
with(
    pres_debates2012,
    dispersion_plot(
        dialogue, wrds2,
        rm.vars = time, color = "black", bg.color = "white"
    )
)

Word Cloud (Colored Terms)

# Target word groups for the colored word cloud. The call that consumes this
# list passes five colors and the legend c(names(terms), "other"), i.e. four
# named groups plus "other", so four groups are defined here.
# NOTE(review): the `pronoun` group follows the qdap trans_cloud example --
# confirm it matches the intended slide content.
terms <- list(
    I = c("i", "i'm"),
    mal = qcv(stinks, dumb, distrust),
    articles = qcv(the, a, an),
    pronoun = qcv(we, you)
)
Word Cloud Plot (Colored Terms)

# Word cloud of `state` with the groups in `terms` as target words, one color
# per entry of cloud.colors and a legend of the group names plus "other"
with(
    DATA,
    trans_cloud(
        state,
        target.words = terms,
        cloud.colors = qcv(red, green, blue, black, gray65),
        expand.target = FALSE,
        proportional = TRUE,
        legend = c(names(terms), "other"),
        max.word.size = 8,
        min.word.size = .5
    )
)

Gradient Cloud

# Start DATA2 as a copy of DATA so the space-filled text, which has one entry
# per row of DATA, can be assigned to its `state` column.
# NOTE(review): without this copy DATA2 is either undefined or some other
# object whose row count need not match DATA -- confirm the copy is intended.
DATA2 <- DATA
DATA2$state <- space_fill(DATA$state, c("is fun", "too fun", "you liar"))
# Gradient cloud of the original `state` text by `sex`
gradient_cloud(DATA$state, DATA$sex, title="Houghton Colors", 
    max.word.size = 8, min.word.size = .01, X ="purple" , Y = "yellow")

Gradient Cloud

# Gradient cloud of the DATA2 `state` text by `sex`, with X = "red" and
# Y = "blue"
gradient_cloud(
    DATA2$state, DATA2$sex,
    title = "Gender Word Use",
    max.word.size = 8, min.word.size = .01,
    X = "red", Y = "blue"
)

Turn of Talk Plot



# Turn-of-talk plot of the "dialogue" column of mraja1, grouped by "sex" and
# "fam.aff". NOTE(review): plot=FALSE presumably returns the ggplot object
# undrawn so that the layers below can be added with `+` -- confirm
tot_plot(mraja1, "dialogue", grouping.var = c("sex", "fam.aff"), tot=FALSE, plot=FALSE)+
    scale_fill_brewer(palette="Set1") +
    # Horizontal reference lines at the mean word count, mean + 2 sd and
    # mean + 3 sd
    geom_hline(aes(yintercept=mean(word.count))) +
    geom_hline(aes(yintercept=mean(word.count) + (2 *sd(word.count)))) +
    geom_hline(aes(yintercept=mean(word.count) + (3 *sd(word.count)))) +
    # Text labels positioned 2 units above each reference line; the first is
    # parsed as an expression (parse=TRUE), the other two are literal text
    geom_text(parse=TRUE, hjust=0, vjust=0, size = 3, aes(x = 2, 
        y = mean(word.count) + 2, label = "bar(x)")) +
    geom_text(hjust=0, vjust=0, size = 3, aes(x = 1, 
        y = mean(word.count) + (2 *sd(word.count)) + 2, label = "+2 sd")) +
    geom_text(hjust=0, vjust=0,  size = 3, aes(x = 1, 
        y = mean(word.count) + (3 *sd(word.count)) + 2, label = "+3 sd")) +
    ggtitle("Romeo & Juliet:\nAct 1 Turns of Talk")

Venn Diagram

# Venn diagram of `state` by `person`, with the legend at "topright"
with(DATA , trans_venn(state, person, legend.location = "topright"))

Word Network Plot

# Word network plot of `state` with `person` as the second argument;
# stopwords=NULL is passed explicitly
word_network_plot(text.var=DATA$state, DATA$person, stopwords=NULL)


  1. How might qdap + R fit into your workflow?
  2. What do you want to know more about?
  3. Are there any points that need to be clarified?

