In quanteda/quanteda: Quantitative Analysis of Textual Data

# Chunk defaults: do not merge source and output into one block, and prefix
# printed output lines with "##"
knitr::opts_chunk$set(collapse = FALSE, comment = "##")

library("quanteda")

Download corpus

Download a corpus constructed from the Reports on the Work of the Government published by the Premier of the State Council between 1954 and 2017. You can download the corpus using the quanteda.corpora package.

# Install quanteda.corpora from GitHub only when it is not already available,
# so that re-running the script does not reinstall the package every time
if (!requireNamespace("quanteda.corpora", quietly = TRUE)) {
  remotes::install_github("quanteda/quanteda.corpora")
}

library("quanteda.corpora")

# Download the corpus of government work reports (stored remotely as an .rds
# file) and keep it as `corp` for the steps below
corp <- quanteda.corpora::download(
  url = "https://www.dropbox.com/s/37ojd5knz1qeyul/data_corpus_chinesegovreport.rds?dl=1"
)

Tokenization

# Stopword list for Chinese ("zh") taken from the "misc" source
ch_stop <- stopwords("zh", source = "misc")

# Split the corpus into tokens without punctuation, then drop the stopwords
toks_ch <- tokens(corp, remove_punct = TRUE)
toks_ch <- tokens_remove(toks_ch, pattern = ch_stop)

# Build the document-feature matrix from the cleaned tokens
dfmat_ch <- dfm(toks_ch)

# Show the most frequent features of the matrix
topfeatures(dfmat_ch)

Analysis

Word cloud

# Fix the random seed before plotting the word cloud
set.seed(100)

library("quanteda.textplots")

# Word cloud of at most 100 features that occur at least 500 times.
# The "SimHei" font is requested on macOS only, to set the font correctly
# there; on other systems the default font is used.
textplot_wordcloud(
  dfmat_ch,
  min_count = 500,
  max_words = 100,
  min_size = 0.5,
  max_size = 2.8,
  rotation = 0.25,
  random_order = FALSE,
  color = RColorBrewer::brewer.pal(8, "Dark2"),
  font = if (Sys.info()[["sysname"]] == "Darwin") "SimHei" else NULL
)

Feature co-occurrence matrix

# Keep only the document(s) whose Year docvar is 2017
corp_ch17 <- corpus_subset(corp, Year == "2017")

# Tokenize the 2017 subset, dropping punctuation and Chinese stopwords
toks_ch17 <- corp_ch17 |>
  tokens(remove_punct = TRUE) |>
  tokens_remove(pattern = ch_stop)

# Feature co-occurrence matrix within a window of 5 tokens.
# Built from the cleaned tokens above rather than from a fresh raw
# tokenization, so punctuation marks and stopwords do not enter the
# co-occurrence counts (consistent with how dfmat_ch is constructed).
fcmat_ch <- fcm(toks_ch17, context = "window", window = 5)

Unsupervised document scaling

In this example, we run a Wordfish model to show how to apply an unsupervised document scaling method to Chinese texts.

library("quanteda.textmodels")

# Fit a Wordfish scaling model to the document-feature matrix
tmod_wf <- textmodel_wordfish(dfmat_ch)

# Years plotted against the estimated positions: 1954-1964 and 1975-2017,
# excluding 1961-1963, 1976 and 1977
# NOTE(review): presumably these are the years that have a report in the
# corpus, in document order -- confirm against the corpus docvars
y <- setdiff(c(1954:1964, 1975:2017), c(1961:1963, 1976, 1977))
plot(x = y, y = tmod_wf$theta, xlab = "Year", ylab = "Position")

Collocations

library("quanteda.textstats")

# Two-word collocations across the whole dataset, keeping only those that
# occur at least 20 times; show the first ten rows as a table
tstat_col_ch <- textstat_collocations(toks_ch, size = 2, min_count = 20)
tstat_col_ch |>
  head(10) |>
  knitr::kable()

# Two-word collocations in the 2017 report (default minimum count);
# show the first ten rows as a table
tstat_col_ch17 <- textstat_collocations(toks_ch17, size = 2)
tstat_col_ch17 |>
  head(10) |>
  knitr::kable()