# knitr chunk options: do not collapse source and output into one block,
# and prefix printed output lines with "##".
knitr::opts_chunk$set(collapse = FALSE, comment = "##")

library(quanteda)
Download the corpus constructed from the Reports on the Work of the Government, published by the Premier of the State Council between 1954 and 2017. You can download the corpus using the quanteda.corpora package.
# Read the text files.
# If quanteda.corpora is not installed yet, it can be installed with:
#   devtools::install_github("quanteda/quanteda.corpora")
library(quanteda.corpora)

# Download the corpus object (an .rds file) and keep it as `corp`; every
# later step in this script starts from this object.
corp <- quanteda.corpora::download(
  url = "https://www.dropbox.com/s/37ojd5knz1qeyul/data_corpus_chinesegovreport.rds?dl=1"
)
# Chinese stopwords
ch_stop <- stopwords("zh", source = "misc")

# Tokenize the corpus, dropping punctuation, then remove the stopwords.
ch_toks <- tokens(corp, remove_punct = TRUE) %>%
  tokens_remove(ch_stop)

# Construct a document-feature matrix and show its most frequent features.
ch_dfm <- dfm(ch_toks)
topfeatures(ch_dfm)
# Plot a word cloud of the features in ch_dfm.
# NOTE(review): in quanteda >= 3.0 textplot_wordcloud() is provided by the
# quanteda.textplots package -- confirm against the installed version.

# Fix the RNG seed so the plot is reproducible between runs.
set.seed(100)

# The font is only set explicitly on macOS ("Darwin"), so that Chinese
# characters render correctly there; elsewhere the default font is used.
textplot_wordcloud(
  ch_dfm,
  min_count = 500,
  random_order = FALSE,
  rotation = 0.25,
  max_words = 100,
  min_size = 0.5,
  max_size = 2.8,
  font = if (Sys.info()[["sysname"]] == "Darwin") "SimHei" else NULL,
  color = RColorBrewer::brewer.pal(8, "Dark2")
)
# Feature co-occurrence matrix (fcm) within a window size of 5,
# computed on the 2017 report only.
ch17_corp <- corpus_subset(corp, Year == "2017")
ch17_toks <- tokens(ch17_corp, remove_punct = TRUE) %>%
  tokens_remove(ch_stop)

# window = 5 is spelled out so the code matches the comment above
# (it is also the documented default of fcm()).
ch_fcm <- fcm(ch17_toks, context = "window", window = 5)

# Features that co-occur most often with "改革".
topfeatures(ch_fcm["改革", ])
library("quanteda.textmodels")

# Fit a Wordfish model to the document-feature matrix of all reports.
wf <- textmodel_wordfish(ch_dfm)

# Build the x-axis: the years 1954-2017, excluding 1965-1974 and
# 1961-1963, 1976-1977.
# NOTE(review): presumably these are the years with no report in the corpus,
# so that `y` lines up one-to-one, in order, with the documents in ch_dfm --
# confirm against the corpus' Year document variable.
y <- 1954:2017
y <- y[y <= 1964 | y >= 1975]
y <- y[!y %in% c(1963, 1961, 1962, 1976, 1977)]

# Plot the fitted theta values against year.
plot(y, wf$theta, xlab = "Year", ylab = "Position")
# NOTE(review): in quanteda >= 3.0 textstat_collocations() is provided by the
# quanteda.textstats package -- confirm against the installed version.

# Bigrams across the whole dataset (only those occurring at least 20 times);
# show the first 10 rows as a table.
ch_col <- textstat_collocations(ch_toks, size = 2, min_count = 20)
knitr::kable(head(ch_col, 10))

# Bigrams in the 2017 report; show the first 10 rows as a table.
ch17_col <- textstat_collocations(ch17_toks, size = 2)
knitr::kable(head(ch17_col, 10))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.