inst/doc/quickstart.R

## ---- echo = FALSE------------------------------------------------------------
knitr::opts_chunk$set(collapse = FALSE,
                      comment = "##",
                      tidy = TRUE)

## ---- eval = FALSE------------------------------------------------------------
#  install.packages("quanteda")

## ----eval = FALSE-------------------------------------------------------------
#  devtools::install_github("quanteda/quanteda.corpora")

## ----eval = FALSE-------------------------------------------------------------
#  devtools::install_github("kbenoit/quanteda.dictionaries")

## ---- message = FALSE---------------------------------------------------------
library("quanteda")

## -----------------------------------------------------------------------------
corp_uk <- corpus(data_char_ukimmig2010)  # build a new corpus from the texts
summary(corp_uk)

## -----------------------------------------------------------------------------
docvars(corp_uk, "Party") <- names(data_char_ukimmig2010)
docvars(corp_uk, "Year") <- 2010
summary(corp_uk)
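
## -----------------------------------------------------------------------------
# a quick sketch: read the new docvars back to confirm the assignment
docvars(corp_uk, "Party")
ndoc(corp_uk)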

## ---- eval=FALSE--------------------------------------------------------------
#  require("readtext")
#  
#  # Twitter json
#  dat_json <- readtext("~/Dropbox/QUANTESS/social media/zombies/tweets.json")
#  corp_twitter <- corpus(dat_json)
#  summary(corp_twitter, 5)
#  # generic json - needs a textfield specifier
#  dat_sotu <- readtext("~/Dropbox/QUANTESS/Manuscripts/collocations/Corpora/sotu/sotu.json",
#                    textfield = "text")
#  summary(corpus(dat_sotu), 5)
#  # text file
#  dat_txtone <- readtext("~/Dropbox/QUANTESS/corpora/project_gutenberg/pg2701.txt", cache = FALSE)
#  summary(corpus(dat_txtone), 5)
#  # multiple text files
#  dat_txtmultiple1 <- readtext("~/Dropbox/QUANTESS/corpora/inaugural/*.txt", cache = FALSE)
#  summary(corpus(dat_txtmultiple1), 5)
#  # multiple text files with docvars from filenames
#  dat_txtmultiple2 <- readtext("~/Dropbox/QUANTESS/corpora/inaugural/*.txt",
#                               docvarsfrom = "filenames", sep = "-",
#                               docvarnames = c("Year", "President"))
#  summary(corpus(dat_txtmultiple2), 5)
#  # XML data
#  dat_xml <- readtext("~/Dropbox/QUANTESS/quanteda_working_files/xmlData/plant_catalog.xml",
#                    textfield = "COMMON")
#  summary(corpus(dat_xml), 5)
#  # csv file
#  write.csv(data.frame(inaug_speech = as.character(data_corpus_inaugural),
#                       docvars(data_corpus_inaugural)),
#            file = "/tmp/inaug_texts.csv", row.names = FALSE)
#  dat_csv <- readtext("/tmp/inaug_texts.csv", textfield = "inaug_speech")
#  summary(corpus(dat_csv), 5)
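
## ---- eval = FALSE------------------------------------------------------------
#  # a follow-on sketch: a readtext object carries a doc_id plus any docvars
#  # (here the columns of the csv other than the text field), and corpus()
#  # picks these up automatically
#  corp_csv <- corpus(dat_csv)
#  head(docvars(corp_csv))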

## -----------------------------------------------------------------------------
as.character(data_corpus_inaugural)[2]

## -----------------------------------------------------------------------------
summary(data_corpus_inaugural, n = 5)
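
## -----------------------------------------------------------------------------
# a sketch of the counts behind summary(): ndoc() for documents, ntoken()
# for token counts on a tokens object
ndoc(data_corpus_inaugural)
head(ntoken(tokens(data_corpus_inaugural)), 5)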

## ---- fig.width = 8-----------------------------------------------------------
tokeninfo <- summary(data_corpus_inaugural)
tokeninfo$Year <- docvars(data_corpus_inaugural, "Year")
if (require("ggplot2"))
    ggplot(data = tokeninfo, aes(x = Year, y = Tokens, group = 1)) +
    geom_line() +
    geom_point() +
    scale_x_continuous(labels = seq(1789, 2017, 12), breaks = seq(1789, 2017, 12)) +
    theme_bw()

# Longest inaugural address: William Henry Harrison
tokeninfo[which.max(tokeninfo$Tokens), ]
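
# shortest inaugural address, found the same way
tokeninfo[which.min(tokeninfo$Tokens), ]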

## -----------------------------------------------------------------------------
corp1 <- corpus(data_corpus_inaugural[1:5])
corp2 <- corpus(data_corpus_inaugural[53:58])
corp3 <- corp1 + corp2
summary(corp3)

## -----------------------------------------------------------------------------
summary(corpus_subset(data_corpus_inaugural, Year > 1990))
summary(corpus_subset(data_corpus_inaugural, President == "Adams"))
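
## -----------------------------------------------------------------------------
# a sketch: subsetting conditions can be combined with the usual logical
# operators
summary(corpus_subset(data_corpus_inaugural, Year > 1960 & Party == "Democratic"))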

## -----------------------------------------------------------------------------
data_tokens_inaugural <- tokens(data_corpus_inaugural)
kwic(data_tokens_inaugural, pattern = "terror")

## -----------------------------------------------------------------------------
kwic(data_tokens_inaugural, pattern = "terror", valuetype = "regex")

## -----------------------------------------------------------------------------
kwic(data_tokens_inaugural, pattern = "communist*")

## -----------------------------------------------------------------------------
kwic(data_tokens_inaugural, pattern = phrase("United States")) %>%
    head() # show context of the first six occurrences of "United States"
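
## -----------------------------------------------------------------------------
# a sketch: the window argument controls how many context tokens kwic() shows
kwic(data_tokens_inaugural, pattern = "terror", window = 3) %>%
    head()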

## -----------------------------------------------------------------------------
# inspect the document-level variables
head(docvars(data_corpus_inaugural))

## -----------------------------------------------------------------------------
txt <- c(text1 = "This is $10 in 999 different ways,\n up and down; left and right!",
         text2 = "@kenbenoit working: on #quanteda 2day\t4ever, http://textasdata.com?page=123.")
tokens(txt)
tokens(txt, remove_numbers = TRUE,  remove_punct = TRUE)
tokens(txt, remove_numbers = FALSE, remove_punct = TRUE)
tokens(txt, remove_numbers = TRUE,  remove_punct = FALSE)
tokens(txt, remove_numbers = FALSE, remove_punct = FALSE)
tokens(txt, remove_numbers = FALSE, remove_punct = FALSE, remove_separators = FALSE)
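
## -----------------------------------------------------------------------------
# a sketch: further processing is handled by the tokens_*() family, e.g.
# lowercasing and stopword removal after tokenisation
tokens(txt, remove_punct = TRUE) %>%
    tokens_tolower() %>%
    tokens_remove(stopwords("en"))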

## -----------------------------------------------------------------------------
tokens("Great website: http://textasdata.com?page=123.", what = "character")
tokens("Great website: http://textasdata.com?page=123.", what = "character",
         remove_separators = FALSE)

## -----------------------------------------------------------------------------
# sentence level
tokens(c("Kurt Vonnegut said; only assholes use semi-colons.",
         "Today is Thursday in Canberra: It is yesterday in London.",
         "En el caso de que no puedas ir con ellos, ¿quieres ir con nosotros?"),
       what = "sentence")

## -----------------------------------------------------------------------------
tokens("New York City is located in the United States.") %>%
    tokens_compound(pattern = phrase(c("New York City", "United States")))
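
## -----------------------------------------------------------------------------
# a related sketch: tokens_ngrams() forms all adjacent n-grams, rather than
# compounding only selected phrases
tokens("New York City is located in the United States.", remove_punct = TRUE) %>%
    tokens_ngrams(n = 2)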

## -----------------------------------------------------------------------------
corp_inaug_post1990 <- corpus_subset(data_corpus_inaugural, Year > 1990)

# make a dfm
dfmat_inaug_post1990 <- tokens(corp_inaug_post1990) %>%
  dfm()
dfmat_inaug_post1990[, 1:5]
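
## -----------------------------------------------------------------------------
# a sketch: basic dimensions of the dfm just created
ndoc(dfmat_inaug_post1990)
nfeat(dfmat_inaug_post1990)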

## -----------------------------------------------------------------------------
# make a dfm, removing stopwords and applying stemming
dfmat_inaug_post1990 <- tokens(corp_inaug_post1990, remove_punct = TRUE) %>%
  tokens_remove(stopwords("en")) %>%
  tokens_wordstem(language = "en") %>%
  dfm()
dfmat_inaug_post1990[, 1:5]

## -----------------------------------------------------------------------------
head(stopwords("en"), 20)
head(stopwords("ru"), 10)
head(stopwords("ar", source = "misc"), 10)

## ----warning=FALSE, fig.width = 8, fig.height = 8-----------------------------
dfmat_uk <- tokens(data_char_ukimmig2010, remove_punct = TRUE) %>%
  tokens_remove(stopwords("en")) %>%
  dfm()
dfmat_uk

## -----------------------------------------------------------------------------
topfeatures(dfmat_uk, 20) # 20 most frequent words
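
## -----------------------------------------------------------------------------
# a sketch, assuming the quanteda.textstats package is installed:
# textstat_frequency() returns the same information as a tidy data frame
if (require("quanteda.textstats")) {
    head(textstat_frequency(dfmat_uk), 10)
}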

## ----warning=FALSE, fig.width = 7, fig.height = 7-----------------------------
set.seed(100)
library("quanteda.textplots")
textplot_wordcloud(dfmat_uk, min_count = 6, random_order = FALSE,
                   rotation = .25,
                   color = RColorBrewer::brewer.pal(8, "Dark2"))

## -----------------------------------------------------------------------------
dfmat_pres <- tail(data_corpus_inaugural, 20) %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(stopwords("en")) %>%
  dfm() %>%
  dfm_group(groups = Party)

## -----------------------------------------------------------------------------
dfm_sort(dfmat_pres)
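
## -----------------------------------------------------------------------------
# a sketch: dfm_weight() rescales the grouped counts to within-party
# proportions, making the parties directly comparable
dfm_weight(dfmat_pres, scheme = "prop") %>%
    dfm_sort()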

## -----------------------------------------------------------------------------
corp_inaug_post1991 <- corpus_subset(data_corpus_inaugural, Year > 1991)

## -----------------------------------------------------------------------------
dict <- dictionary(list(terror = c("terrorism", "terrorists", "threat"),
                        economy = c("jobs", "business", "grow", "work")))

## -----------------------------------------------------------------------------
dfmat_inaug_post1991_dict <- tokens(corp_inaug_post1991) %>%
  tokens_lookup(dictionary = dict) %>%
  dfm()
dfmat_inaug_post1991_dict
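
## -----------------------------------------------------------------------------
# a sketch: convert() turns the small dictionary-based dfm into a data frame
convert(dfmat_inaug_post1991_dict, to = "data.frame")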

## ---- eval = FALSE------------------------------------------------------------
#  dictliwc <- dictionary(file = "LIWC2001_English.dic", format = "LIWC")
#  dfmat_inaug_subset <- tokens(data_corpus_inaugural[52:58]) %>%
#    tokens_lookup(dictionary = dictliwc) %>%
#    dfm()
#  dfmat_inaug_subset[, 1:10]

## ----fig.width = 6, fig.height = 3, eval = FALSE------------------------------
#  library("quanteda.textstats")
#  dfmat_inaug_post1980 <- corpus_subset(data_corpus_inaugural, Year > 1980) %>%
#    tokens(remove_punct = TRUE) %>%
#    tokens_wordstem(language = "en") %>%
#    tokens_remove(stopwords("en")) %>%
#    dfm()
#  tstat_obama <- textstat_simil(dfmat_inaug_post1980,
#                                dfmat_inaug_post1980[c("2009-Obama", "2013-Obama"), ],
#                                margin = "documents", method = "cosine")
#  as.list(tstat_obama)
#  dotchart(as.list(tstat_obama)$"2013-Obama", xlab = "Cosine similarity", pch = 19)

## ---- eval = FALSE------------------------------------------------------------
#  data_corpus_sotu <- readRDS(url("https://quanteda.org/data/data_corpus_sotu.rds"))
#  dfmat_sotu <- corpus_subset(data_corpus_sotu, Date > as.Date("1980-01-01")) %>%
#    tokens(remove_punct = TRUE) %>%
#    tokens_wordstem(language = "en") %>%
#    tokens_remove(stopwords("en")) %>%
#    dfm()
#  dfmat_sotu <- dfm_trim(dfmat_sotu, min_termfreq = 5, min_docfreq = 3)

## ----echo = FALSE-------------------------------------------------------------
load("../tests/data/dfmat_sotu.rda")

## ---- fig.width = 8, fig.height = 5, eval = FALSE-----------------------------
#  # hierarchical clustering - get distances on normalized dfm
#  tstat_dist <- textstat_dist(dfm_weight(dfmat_sotu, scheme = "prop"))
#  # hierarchical clustering of the distance object
#  pres_cluster <- hclust(as.dist(tstat_dist))
#  # label with document names
#  pres_cluster$labels <- docnames(dfmat_sotu)
#  # plot as a dendrogram
#  plot(pres_cluster, xlab = "", sub = "",
#       main = "Euclidean Distance on Normalized Token Frequency")

## ---- eval = FALSE------------------------------------------------------------
#  tstat_sim <- textstat_simil(dfmat_sotu, dfmat_sotu[, c("fair", "health", "terror")],
#                              method = "cosine", margin = "features")
#  lapply(as.list(tstat_sim), head, 10)

## ----fig.width = 7, fig.height = 5--------------------------------------------
if (require("quanteda.textmodels") && require("quanteda.textplots")) {
  dfmat_ire <- dfm(tokens(data_corpus_irishbudget2010))
  tmod_wf <- textmodel_wordfish(dfmat_ire, dir = c(2, 1))
  
  # plot the Wordfish estimates by party
  textplot_scale1d(tmod_wf, groups = docvars(dfmat_ire, "party"))
}
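
## -----------------------------------------------------------------------------
# a sketch: the fitted model also carries the document-level estimates directly
if (exists("tmod_wf")) summary(tmod_wf)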

## -----------------------------------------------------------------------------
if (require("quanteda.textmodels")) {
    quant_dfm <- tokens(data_corpus_irishbudget2010, remove_punct = TRUE, remove_numbers = TRUE) %>%
        tokens_remove(stopwords("en")) %>%
        dfm()
    quant_dfm <- dfm_trim(quant_dfm, min_termfreq = 4, max_docfreq = 10)
    quant_dfm 
}

## ----fig.width = 7, fig.height = 5--------------------------------------------
if (require("stm") && require("quanteda.textmodels")) {
    set.seed(100)
    my_lda_fit20 <- stm(quant_dfm, K = 20, verbose = FALSE)
    plot(my_lda_fit20)    
}
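
## ---- eval = FALSE------------------------------------------------------------
#  # an alternative sketch: convert() the dfm explicitly into stm's native
#  # input format instead of passing the dfm directly
#  stm_input <- convert(quant_dfm, to = "stm")
#  my_lda_fit20 <- stm::stm(documents = stm_input$documents,
#                           vocab = stm_input$vocab, K = 20, verbose = FALSE)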
