# "Nothing" -- stray web-scrape artifact; commented out so the file parses as R.
## ----echo = FALSE-------------------------------------------------------------
# Global knitr chunk options: leave source/output uncollapsed and prefix
# printed output with "##".
knitr::opts_chunk$set(comment = "##", collapse = FALSE)
## ----eval = FALSE-------------------------------------------------------------
# # One-time setup (not run during the vignette build): install quanteda from CRAN.
# install.packages("quanteda")
## ----eval = FALSE-------------------------------------------------------------
# # Optional: additional example corpora from the quanteda GitHub organisation.
# remotes::install_github("quanteda/quanteda.corpora")
## ----eval = FALSE-------------------------------------------------------------
# # Optional: extra dictionary resources.
# remotes::install_github("kbenoit/quanteda.dictionaries")
## ----message = FALSE----------------------------------------------------------
# Attach quanteda; startup messages are suppressed by this chunk's
# message = FALSE option.
library("quanteda")
## ----include=FALSE------------------------------------------------------------
# Restrict quanteda to a single thread (commonly done so vignette builds are
# reproducible and resource-friendly, e.g. under CRAN checks).
quanteda_options(threads = 1)
## -----------------------------------------------------------------------------
corp_uk <- corpus(data_char_ukimmig2010) # build a new corpus from the texts
summary(corp_uk)
## -----------------------------------------------------------------------------
# Attach document-level variables (docvars): the party label comes from the
# names of the source character vector; every document gets Year = 2010.
docvars(corp_uk, "Party") <- names(data_char_ukimmig2010)
docvars(corp_uk, "Year") <- 2010
summary(corp_uk)
## ----eval=FALSE---------------------------------------------------------------
# # Use library() rather than require(): library() errors immediately if the
# # readtext package is missing, whereas require() only returns FALSE.
# library("readtext")
#
# # Twitter json
# dat_json <- readtext("social_media/zombies/tweets.json")
# corp_twitter <- corpus(dat_json)
# summary(corp_twitter, 5)
#
# # generic json - needs a textfield specifier
# dat_sotu <- readtext("corpora/sotu/sotu.json", text_field = "text")
# summary(corpus(dat_sotu), 5)
#
# # text file
# dat_txtone <- readtext("corpora/project_gutenberg/pg2701.txt")
# summary(corpus(dat_txtone), 5)
#
# # multiple text files
# dat_txtmultiple1 <- readtext("corpora/inaugural/*.txt")
# summary(corpus(dat_txtmultiple1), 5)
#
# # multiple text files with docvars parsed from the filenames
# dat_txtmultiple2 <- readtext("corpora/inaugural/*.txt",
#                              docvarsfrom = "filenames", sep = "-",
#                              docvarnames = c("Year", "President"))
# summary(corpus(dat_txtmultiple2), 5)
#
# # XML data
# dat_xml <- readtext("xmlData/plant_catalog.xml", text_field = "COMMON")
# summary(corpus(dat_xml), 5)
#
# # csv file: write to the session temporary directory, which is portable,
# # unlike a hard-coded "/tmp" path (which does not exist on Windows)
# csv_file <- file.path(tempdir(), "inaug_texts.csv")
# write.csv(data.frame(inaug_speech = as.character(data_corpus_inaugural),
#                      docvars(data_corpus_inaugural)),
#           file = csv_file, row.names = FALSE)
# dat_csv <- readtext(csv_file, text_field = "inaug_speech")
# summary(corpus(dat_csv), 5)
## -----------------------------------------------------------------------------
# The built-in corpus of US presidential inaugural addresses.
print(data_corpus_inaugural)
## -----------------------------------------------------------------------------
# Extract the raw text of the second document.
as.character(data_corpus_inaugural)[2]
## -----------------------------------------------------------------------------
# Summarise only the first 5 documents.
summary(data_corpus_inaugural, n = 5)
## ----fig.width = 8------------------------------------------------------------
# Summarise every document, then plot token counts over time.
tokeninfo <- summary(data_corpus_inaugural)
# NOTE(review): the summary above may already carry a Year column from the
# docvars, which would make this reassignment redundant (harmless) -- confirm.
tokeninfo$Year <- docvars(data_corpus_inaugural, "Year")
# with() keeps the axis labels as plain "Year"/"Tokens".
with(tokeninfo, plot(Year, Tokens, type = "b", pch = 19, cex = .7))
## -----------------------------------------------------------------------------
# longest inaugural address: William Henry Harrison
tokeninfo[which.max(tokeninfo$Tokens), ]
## -----------------------------------------------------------------------------
# Corpus objects can be concatenated with `+`.
corp1 <- head(data_corpus_inaugural, 2)
corp2 <- tail(data_corpus_inaugural, 2)
corp3 <- corp1 + corp2
summary(corp3)
## -----------------------------------------------------------------------------
# Subset a corpus using conditions on its docvars.
summary(corpus_subset(data_corpus_inaugural, Year > 1990))
summary(corpus_subset(data_corpus_inaugural, President == "Adams"))
## -----------------------------------------------------------------------------
# Tokenise once and reuse for the keyword-in-context (kwic) searches below.
data_tokens_inaugural <- tokens(data_corpus_inaugural)
kwic(data_tokens_inaugural, pattern = "terror")
## -----------------------------------------------------------------------------
# Regular-expression matching: also matches longer words such as "terrorism".
kwic(data_tokens_inaugural, pattern = "terror", valuetype = "regex")
## -----------------------------------------------------------------------------
# Default "glob" matching: "*" acts as a wildcard suffix.
kwic(data_tokens_inaugural, pattern = "communist*")
## -----------------------------------------------------------------------------
# show context of the first six occurrences of "United States"
# phrase() makes the two-word pattern match as a token sequence.
kwic(data_tokens_inaugural, pattern = phrase("United States")) |>
head()
## -----------------------------------------------------------------------------
# inspect the document-level variables
head(docvars(data_corpus_inaugural))
## -----------------------------------------------------------------------------
# Demonstrate tokens() options on text containing currency, numbers,
# punctuation, a username, a hashtag, and a URL.
txt <- c(text1 = "This is $10 in 999 different ways,\n up and down; left and right!",
text2 = "@koheiw7 working: on #quanteda 2day\t4ever, http://textasdata.com?page=123.")
tokens(txt)
tokens(txt, remove_numbers = TRUE, remove_punct = TRUE)
tokens(txt, remove_numbers = FALSE, remove_punct = TRUE)
tokens(txt, remove_numbers = TRUE, remove_punct = FALSE)
tokens(txt, remove_numbers = FALSE, remove_punct = FALSE)
# remove_separators = FALSE additionally keeps whitespace separators as tokens.
tokens(txt, remove_numbers = FALSE, remove_punct = FALSE, remove_separators = FALSE)
## -----------------------------------------------------------------------------
# Character-level tokenisation.
tokens("Great website: http://textasdata.com?page=123.", what = "character")
tokens("Great website: http://textasdata.com?page=123.", what = "character",
remove_separators = FALSE)
## -----------------------------------------------------------------------------
# sentence level
tokens(c("Kurt Vongeut said; only assholes use semi-colons.",
"Today is Thursday in Canberra: It is yesterday in London.",
"En el caso de que no puedas ir con ellos, ¿quieres ir con nosotros?"),
what = "sentence")
## -----------------------------------------------------------------------------
# Built-in stopword lists, selected by language code.
head(stopwords("en"), 20)
head(stopwords("ru"), 10)
# Arabic stopwords from the "misc" source.
head(stopwords("ar", source = "misc"), 10)
## -----------------------------------------------------------------------------
# Join multi-word expressions into single compound tokens.
tokens("New York City is located in the United States.") |>
tokens_compound(pattern = phrase(c("New York City", "United States")))
## -----------------------------------------------------------------------------
# Split tokens on a separator character.
tokens("one~two~three") |>
tokens_split(separator = "~")
## -----------------------------------------------------------------------------
# Inaugural speeches delivered after 1990.
corp_inaug_post1990 <- corpus_subset(data_corpus_inaugural, Year > 1990)
# Build a document-feature matrix: tokenise, then tabulate features.
dfmat_inaug_post1990 <- dfm(tokens(corp_inaug_post1990))
print(dfmat_inaug_post1990)
## ----fig.width = 8, fig.height = 8--------------------------------------------
# Tokenise the UK immigration texts without punctuation, drop English
# stopwords, and tabulate the result as a document-feature matrix.
dfmat_uk <- dfm(tokens_remove(tokens(data_char_ukimmig2010, remove_punct = TRUE),
                              stopwords("en")))
dfmat_uk
## -----------------------------------------------------------------------------
# 20 most frequent words
topfeatures(dfmat_uk, 20)
## -----------------------------------------------------------------------------
# dfm of the last 20 inaugural speeches, with rows aggregated by the
# Party docvar via dfm_group().
dfmat_pres <- tail(data_corpus_inaugural, 20) |>
tokens(remove_punct = TRUE) |>
tokens_remove(stopwords("en")) |>
dfm() |>
dfm_group(groups = Party)
## -----------------------------------------------------------------------------
# Sort columns by overall feature frequency.
dfm_sort(dfmat_pres)
## -----------------------------------------------------------------------------
# Inaugural speeches delivered after 1991.
corp_inaug_post1991 <- corpus_subset(data_corpus_inaugural, Year > 1991)
## -----------------------------------------------------------------------------
# A small two-key demonstration dictionary.
dict <- dictionary(list(
  terror = c("terrorism", "terrorists", "threat"),
  economy = c("jobs", "business", "grow", "work")
))
## -----------------------------------------------------------------------------
# Map tokens onto dictionary keys, then count key occurrences per document.
dfmat_inaug_post1991_dict <- dfm(tokens_lookup(tokens(corp_inaug_post1991),
                                               dictionary = dict))
dfmat_inaug_post1991_dict
## ----eval = FALSE-------------------------------------------------------------
# # Read an external LIWC-format dictionary and apply it to a dfm of
# # inaugural speeches 52-58 (not run: the .dic file is not distributed).
# dictliwc <- dictionary(file = "LIWC2001_English.dic", format = "LIWC")
# dfmat_inaug_subset <- tokens(data_corpus_inaugural[52:58]) |>
#   dfm() |>
#   dfm_lookup(dictionary = dictliwc)
# # Show the first ten dictionary-key columns.
# dfmat_inaug_subset[, 1:10]
## The lines below are website boilerplate captured during extraction, not R
## code; they are commented out so the script parses.
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.