# Global knitr chunk options: do not collapse source and output into a single
# block, and prefix printed output lines with "##".
knitr::opts_chunk$set(
  collapse = FALSE,
  comment = "##"
)

# Core text-analysis package used by every chunk below.
library("quanteda")
Download a corpus constructed from the Reports on the Work of the Government published by the Premier of the State Council between 1954 and 2017. You can download the corpus using the quanteda.corpora package.
# Load the helper package that provides the corpus downloader.
# One-time setup if it is not installed yet (left commented out):
#   remotes::install_github("quanteda/quanteda.corpora")
library("quanteda.corpora")

# Fetch the corpus object (.rds) from its hosted URL.
corp <- quanteda.corpora::download(
  url = "https://www.dropbox.com/s/37ojd5knz1qeyul/data_corpus_chinesegovreport.rds?dl=1"
)
# Chinese ("zh") stop-word list taken from the "misc" source.
ch_stop <- stopwords(language = "zh", source = "misc")

# Tokenize the corpus without punctuation, then drop the stop words.
toks_ch <- tokens_remove(
  tokens(corp, remove_punct = TRUE),
  pattern = ch_stop
)

# Build the document-feature matrix from the cleaned tokens.
dfmat_ch <- dfm(toks_ch)

# Show the most frequent features.
topfeatures(dfmat_ch)
# Word cloud of the most frequent features.
# Fix the RNG seed before plotting.
set.seed(100)
library("quanteda.textplots")

textplot_wordcloud(
  dfmat_ch,
  min_count = 500,
  random_order = FALSE,
  rotation = 0.25,
  max_words = 100,
  min_size = 0.5,
  max_size = 2.8,
  # On macOS ("Darwin") request the SimHei font so the text is set correctly;
  # on other systems leave the font unset.
  font = if (Sys.info()[["sysname"]] == "Darwin") "SimHei" else NULL,
  color = RColorBrewer::brewer.pal(n = 8, name = "Dark2")
)
# Feature co-occurrence matrix (fcm) for the 2017 report, counted within a
# window of 5 tokens.

# Keep only the document(s) whose Year docvar is 2017.
corp_ch17 <- corpus_subset(corp, Year == "2017")

# Tokenize the 2017 text without punctuation and remove the stop words.
toks_ch17 <- corp_ch17 |>
  tokens(remove_punct = TRUE) |>
  tokens_remove(pattern = ch_stop)

# Count co-occurrences on the cleaned tokens. Re-tokenizing the raw corpus
# here would put punctuation and stop words back into the matrix.
fcmat_ch <- fcm(toks_ch17, context = "window", window = 5)
In this example, we run a Wordfish model to show how to apply an unsupervised document scaling method to Chinese texts.
library("quanteda.textmodels")

# Fit the Wordfish scaling model on the full document-feature matrix.
tmod_wf <- textmodel_wordfish(dfmat_ch)

# Years for the x-axis: 1954-2017, leaving out 1965-1974 as well as
# 1961-1963, 1976 and 1977.
# NOTE(review): this presumes the corpus holds exactly one document for each
# remaining year, in chronological order -- confirm against the corpus.
y <- setdiff(1954:2017, c(1965:1974, 1961:1963, 1976, 1977))

# Plot the estimated document positions (theta) against the year.
plot(y, tmod_wf$theta, xlab = "Year", ylab = "Position")
library("quanteda.textstats")

# Bigram collocations across the whole dataset (minimum count of 20).
tstat_col_ch <- textstat_collocations(toks_ch, size = 2, min_count = 20)
tstat_col_ch |>
  head(10) |>
  knitr::kable()

# Bigram collocations in the 2017 report only.
tstat_col_ch17 <- textstat_collocations(toks_ch17, size = 2)
tstat_col_ch17 |>
  head(10) |>
  knitr::kable()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.