In amatsuo/kaigiroku: Programmatic Access to the API for Japanese Diet Proceedings

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "README-"
)

kaigiroku: Programmatic Access to the API for Japanese Diet Proceedings

This package is to access the API of Kokkai Kaigiroku, conference minutes of Japanese National Diet. The API documentation is available here (only in Japanese).

Prerequisites

Nothing. You may need to download dependent packages during the installation.

Installation

You can isntall the package from CRAN:

install.packages("kaigiroku")

Alternatively, you can install a developer version by:

devtools::install_github("amatsuo/kaigiroku")

Examples

At the moment, the package provide only the functionality to download conference minutes of specific meetings (e.g. Budget Committee (予算委員会) and Plenary Meeting (本会議)) for a specified period.

Download Conference Minutes

require(kaigiroku)
# the following line will download all speeches at plenary meetings in the 
# Upper House (参議院) from Jan 1 to Jan 31, 2017. 
plenary_speeches <- get_meeting(house = "Upper", startDate = "2017-01-01", 
                                endDate = "2017-01-31", meetingName = "本会議")

# the following line will download all speeches at the budget committee meetings in 
# the lower house during the 185th session.
budgetcom_speeches <- get_meeting(house = "Lower", sessionNumber = 185, 
                                  meetingName = "予算委員会")

# the following line will download all speeches at the budget committee meetings in 
# the Prime Minister's Questions (or Party Leader Debate, "国家基本政策委員会合同審査会")
# qt_speeches are used in the later part of demos
qt_speeches <- get_meeting(house = "Both", sessionNumber = 190, 
                           meetingName = "国家基本政策委員会合同審査会")
head(qt_speeches)

Getting meeting information without downloading speech contents

Sometimes you may want to download meeting information, without getting the speech contents. You can get a list of meeting at specific period, by turning on meeting_list option.

The intended use of this option is to get the list of all meetings. The main API does not allow skipping nameOfMeeting option (or you have to specify other option such as speekerName). If you want to get all speeches in a parliamentary session, there is no way to do that directly. You have to find the list of meeting names before hand, and this meeting_list option will do that.

# the following returns the list of meetings in Jauary 2020. Once you know the names of meetings you can use the names to download speech.
get_meeting(house = "Lower", startDate = "2020-01-01", endDate = "2020-01-31", 
            meeting_list = TRUE)

After getting speeches

Using the power of quanteda (and stringi for boundary split), you can easily work with the texts and run analyses.

First, we need to remove unnecessary text sections unique to Japanese conforence minutes.

require(quanteda)
require(stringi)
require(dplyr)

# delete header
qt_speeches <- qt_speeches[qt_speeches$speaker !="", ]
# delete speeches by the chair of the meeting
qt_speeches <- qt_speeches[grep("^○会長", qt_speeches$speech, invert = TRUE), ]

# revmove non-speech part
qt_speeches$speech <- stri_replace_all_regex(qt_speeches$speech, "^○\\S+\\s+", "")
qt_speeches$speech <- stri_replace_all_regex(qt_speeches$speech, "（.+?）|〔.+?〕", "")
qt_speeches$speech <- stri_replace_all_regex(qt_speeches$speech, "^\\s{2,}.+\n", "")

Now, generate corpus and then tokens object.

# generate quanteda corpus object
data_corpus_qtspeech <- corpus(qt_speeches$speech, 
                               docnames = paste(qt_speeches$speaker, qt_speeches$speechOrder),
                               docvars = qt_speeches[, c(1:5, 8:9)])
summary(data_corpus_qtspeech)

data_tokens_qtspeech <- tokens(data_corpus_qtspeech)

# kwicly look at some key terms
kwic(data_tokens_qtspeech, "政府", window = 4) # government 
kwic(data_tokens_qtspeech, "経済", window = 4) # ecnomy
kwic(data_tokens_qtspeech, "成長", window = 4) # growth

require(readtext)
# generate tokens and then dfm 
data_dfm_qtspeech <- tokens(data_corpus_qtspeech, remove_punct = TRUE) %>% dfm()
#topfeatures(data_dfm_qtspeech, n = 100)

# remove stopwords and punctuations (using slothlib list http://www.dl.kuis.kyoto-u.ac.jp/slothlib/)
# Warning: the word list removes potentially important words
jpstopwords <- readtext("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt")
jpstopwords <- tokens(jpstopwords[jpstopwords != ""]) %>% as.character
# remove stopwords
data_dfm_qtspeech <- dfm_remove(data_dfm_qtspeech, 
                                valuetype = "fixed",
                                pattern = jpstopwords)

topfeatures(data_dfm_qtspeech, n = 20) # still a lot of unnecessary features

# remove entries only with hiraganas (ひらがな)
data_dfm_qtspeech <- dfm_remove(data_dfm_qtspeech, 
                                pattern = "^[あ-ん]+$",
                                valuetype = "regex")
print(data_dfm_qtspeech)
topfeatures(data_dfm_qtspeech, n = 20) #looks better

quanteda.textplots::textplot_wordcloud(
  data_dfm_qtspeech, min_count = 6, random_order = FALSE,
  rotation = .25, 
  family = "HiraKakuProN-W3", # or other fonts 
  color = RColorBrewer::brewer.pal(8,"Dark2"))

Topicmodeling

We are going to esitmate an LDA topic model. First regenerate dfm at sentence level

data_corpus_qtspeech_sent <- corpus_reshape(data_corpus_qtspeech, to = "sentences")
data_dfm_qtspeech_sent <- data_corpus_qtspeech_sent %>% tokens(remove_punct = TRUE) %>% dfm
data_dfm_qtspeech_sent <- dfm_remove(data_dfm_qtspeech_sent, 
                                     valuetype = "fixed",
                                     pattern = jpstopwords)

# remove entries only with hiraganas (ひらがな)
data_dfm_qtspeech_sent <- dfm_remove(data_dfm_qtspeech_sent, 
                                     pattern = "^[あ-ん]+$",
                                     valuetype = "regex")

Run the model

require(topicmodels)

model_lda_qt_speeches <- LDA(convert(data_dfm_qtspeech_sent, to = "topicmodels"), 
                             k = 6)
get_terms(model_lda_qt_speeches, 10)
# topics(model_lda_qt_speeches, 3)