# Chunk options applied to every code chunk in this README.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "README-"
)
This package provides access to the API of Kokkai Kaigiroku, the conference minutes of the Japanese National Diet. The API documentation is available here (only in Japanese).
Nothing. You may need to download dependent packages during the installation.
You can install the package from CRAN:
# Install the kaigiroku package from the configured package repository.
install.packages(pkgs = "kaigiroku")
Alternatively, you can install a developer version by:
# Install the kaigiroku package from its GitHub repository.
devtools::install_github(repo = "amatsuo/kaigiroku")
At the moment, the package provides only the functionality to download conference minutes of specific meetings (e.g. Budget Committee (予算委員会) and Plenary Meeting (本会議)) for a specified period.
# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(kaigiroku)

# Download all speeches at plenary meetings (本会議) in the
# Upper House (参議院) from Jan 1 to Jan 31, 2017.
plenary_speeches <- get_meeting(
  house = "Upper",
  startDate = "2017-01-01",
  endDate = "2017-01-31",
  meetingName = "本会議"
)

# Download all speeches at the budget committee (予算委員会) meetings in
# the Lower House during the 185th session.
budgetcom_speeches <- get_meeting(
  house = "Lower",
  sessionNumber = 185,
  meetingName = "予算委員会"
)

# Download all speeches at the Prime Minister's Questions
# (or Party Leader Debate, "国家基本政策委員会合同審査会") during the 190th session.
# qt_speeches is used in the later part of the demos.
qt_speeches <- get_meeting(
  house = "Both",
  sessionNumber = 190,
  meetingName = "国家基本政策委員会合同審査会"
)
head(qt_speeches)
Sometimes you may want to download meeting information without getting the speech contents. You can get a list of meetings in a specific period by turning on the `meeting_list` option.
The intended use of this option is to get the list of all meetings. The main API does not allow skipping the `nameOfMeeting` option (or you have to specify another option such as `speekerName`). If you want to get all speeches in a parliamentary session, there is no way to do that directly. You have to find the list of meeting names beforehand, and this `meeting_list` option will do that.
# The following returns the list of meetings in January 2020.
# Once you know the names of the meetings, you can use the names to
# download speeches.
get_meeting(
  house = "Lower",
  startDate = "2020-01-01",
  endDate = "2020-01-31",
  meeting_list = TRUE
)
Using the power of `quanteda` (and `stringi` for boundary split), you can easily work with the texts and run analyses.
First, we need to remove unnecessary text sections unique to Japanese conference minutes.
library(quanteda)
library(stringi)
library(dplyr)

# delete header rows (rows with an empty speaker field)
qt_speeches <- qt_speeches[qt_speeches$speaker != "", ]

# delete speeches by the chair of the meeting (speech text starts with "○会長")
qt_speeches <- qt_speeches[grep("^○会長", qt_speeches$speech, invert = TRUE), ]

# remove non-speech parts
# 1. the leading "○<speaker name>" marker and the whitespace after it
qt_speeches$speech <- stri_replace_all_regex(
  qt_speeches$speech, "^○\\S+\\s+", ""
)
# 2. annotations enclosed in full-width parentheses （…） or brackets 〔…〕.
#    The parentheses must be the literal full-width characters: ASCII "(" and
#    ")" would form a regex capture group that matches every character and
#    would therefore erase the whole speech.
qt_speeches$speech <- stri_replace_all_regex(
  qt_speeches$speech, "（.+?）|〔.+?〕", ""
)
# 3. a first line that begins with two or more whitespace characters
qt_speeches$speech <- stri_replace_all_regex(
  qt_speeches$speech, "^\\s{2,}.+\n", ""
)
Now, generate a corpus and then a tokens object.
# generate quanteda corpus object; each document is named
# "<speaker> <speechOrder>"
data_corpus_qtspeech <- corpus(
  qt_speeches$speech,
  docnames = paste(qt_speeches$speaker, qt_speeches$speechOrder),
  # NOTE(review): document variables are selected by column position --
  # confirm these indices still match the columns returned by get_meeting().
  docvars = qt_speeches[, c(1:5, 8:9)]
)
summary(data_corpus_qtspeech)

data_tokens_qtspeech <- tokens(data_corpus_qtspeech)

# take a quick keyword-in-context (KWIC) look at some key terms
kwic(data_tokens_qtspeech, "政府", window = 4) # government
kwic(data_tokens_qtspeech, "経済", window = 4) # economy
kwic(data_tokens_qtspeech, "成長", window = 4) # growth
library(readtext)

# generate tokens and then dfm
data_dfm_qtspeech <- tokens(data_corpus_qtspeech, remove_punct = TRUE) %>%
  dfm()
# topfeatures(data_dfm_qtspeech, n = 100)

# remove stopwords and punctuations
# (using slothlib list http://www.dl.kuis.kyoto-u.ac.jp/slothlib/)
# Warning: the word list removes potentially important words
jpstopwords <- readtext("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt")
# drop empty entries, then tokenize the list into a plain character vector
jpstopwords <- tokens(jpstopwords[jpstopwords != ""]) %>%
  as.character()

# remove stopwords
data_dfm_qtspeech <- dfm_remove(
  data_dfm_qtspeech,
  valuetype = "fixed",
  pattern = jpstopwords
)
topfeatures(data_dfm_qtspeech, n = 20) # still a lot of unnecessary features

# remove entries only with hiraganas (ひらがな)
data_dfm_qtspeech <- dfm_remove(
  data_dfm_qtspeech,
  pattern = "^[あ-ん]+$",
  valuetype = "regex"
)
print(data_dfm_qtspeech)
topfeatures(data_dfm_qtspeech, n = 20) # looks better
# Draw a word cloud of the features that occur at least 6 times.
quanteda.textplots::textplot_wordcloud(
  data_dfm_qtspeech,
  min_count = 6,
  random_order = FALSE,
  rotation = .25,
  family = "HiraKakuProN-W3", # or other fonts
  color = RColorBrewer::brewer.pal(8, "Dark2")
)
We are going to estimate an LDA topic model. First, regenerate the dfm at the sentence level.
# reshape the corpus so that each sentence becomes its own document
data_corpus_qtspeech_sent <- corpus_reshape(data_corpus_qtspeech, to = "sentences")

# tokenize (dropping punctuation) and build the sentence-level dfm
data_dfm_qtspeech_sent <- data_corpus_qtspeech_sent %>%
  tokens(remove_punct = TRUE) %>%
  dfm()

# remove stopwords
data_dfm_qtspeech_sent <- dfm_remove(
  data_dfm_qtspeech_sent,
  valuetype = "fixed",
  pattern = jpstopwords
)

# remove entries only with hiraganas (ひらがな)
data_dfm_qtspeech_sent <- dfm_remove(
  data_dfm_qtspeech_sent,
  pattern = "^[あ-ん]+$",
  valuetype = "regex"
)
Run the model
library(topicmodels)

# fit an LDA topic model with 6 topics on the sentence-level dfm
model_lda_qt_speeches <- LDA(
  convert(data_dfm_qtspeech_sent, to = "topicmodels"),
  k = 6
)

# show the top 10 terms of each topic
get_terms(model_lda_qt_speeches, 10)
# topics(model_lda_qt_speeches, 3)
I welcome your comments and feedback. Please file issues on the issues page, and/or send me comments at matsuoakitaka@gmail.com.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.