In petermeissner/idep: Helper functions and procedures for the IDEP Project

# Switch every locale category of the session to "English".
# NOTE(review): "English" looks like a Windows-style locale name; on other
# platforms this call may only emit a warning -- confirm where the report is built.
Sys.setlocale("LC_ALL", "English")
# Echo the source code of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)

library(pander)
panderOptions("digits", 2)
library(memisc)
library(idep)
library(stringr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(gridExtra)

# Load the aggregated data; the .Rdata file is expected to provide the data
# frame `isor` used throughout the report.
load("Z:/Gesch\xe4ftsordnungen/Database/aggregats/isor.Rdata")

# Offset every time stamp by one hour (3600 seconds) per unit of the trailing
# digit of its text id.
# NOTE(review): presumably this keeps several versions passed on the same day
# apart on the time axis -- confirm.
isor$t_date <-
  as.POSIXct(isor$t_date) + 3600 * as.integer(str_extract(isor$t_id, "\\d$"))

# Keep documents dated 1945-01-01 or later; filter() also drops rows whose
# t_date is NA.
isor <-
  isor %>%
  filter(t_date >= as.POSIXct("1945-01-01"))

# Drop the numbered corpus columns. A logical mask is used instead of
# `-grep(...)`: with no matching name, `-integer(0)` would select zero columns
# and silently empty the data frame.
numbered_corp_col <-
  grepl("_corp_\\d|_corp_mdf_\\d|_corp_del_\\d|_corp_ins_\\d", names(isor))
isor <- isor[, !numbered_corp_col, drop = FALSE]

# Black-and-white base theme for every plot of the report.
theme_set(theme_bw())

# Theme additions shared by the country plots: no axis titles, small tick
# labels, and y tick labels rotated by 90 degrees and centred.
theme_stripped <- theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  axis.text.y  = element_text(angle = 90, vjust = 0.5, hjust = 0.5),
  axis.text    = element_text(size = 6)
)

# Common visible time window of the x axis (zooms without dropping data).
xlim_time <- coord_cartesian(
  xlim = c(as.POSIXct("1944-12-31"), as.POSIXct("2011-08-01"))
)

# One manual fill scale shared by the country plots and the legend panel.
# It is assembled from four groups of keys: change types and corpus topics as
# they are named in the data columns, plus the short keys used by the legend.
mega_scale <- local({
  change_columns <- c(
    "wds_ins" = "#498000A0",
    "wds_mdf" = "#E7A350",
    "wds_del" = "#A81606A0"
  )
  topic_columns <- c(
    "wds_corp_top_66" = "#F7FBFF",
    "wds_corp_top_1"  = "#6baed6",
    "wds_corp_top_2"  = "#9ecae1",
    "wds_corp_top_3"  = "#deebf7",
    "wds_corp_top_4"  = "#08519c",
    "wds_corp_top_5"  = "#4292c6"
  )
  topic_keys <- c(
    "misc."     = "#F7FBFF00",
    "laws"      = "#6baed6",
    "special"   = "#9ecae1",
    "elections" = "#deebf7",
    "control"   = "#08519c",
    "pulbic"    = "#4292c6"
  )
  change_keys <- c(
    "ins" = "#498000A0",
    "mdf" = "#E7A350",
    "del" = "#A81606A0"
  )
  scale_fill_manual(
    values = c(change_columns, topic_columns, topic_keys, change_keys)
  )
})

General Overview

# Headline figures referenced by the inline R expressions of the overview text.
ctr       <- unique(isor$ctr)
countries <- unique(isor$country)

# Series identifiers are compared on their first three characters only, so
# identifiers sharing that prefix count as one country.
n_countries <- length(unique(substr(isor$ctr, 1, 3)))

# Earliest and latest document date, formatted as e.g. "January 01, 1945".
date_span <- range(isor$t_date, na.rm = TRUE)
min_date  <- format(date_span[1], "%B %d, %Y")
max_date  <- format(date_span[2], "%B %d, %Y")

The ISOR dataset covers r n_countries countries, namely: r paste(countries[-length(countries)], sep=", ") and r countries[length(countries)]. Switzerland has two parliamentary Standing Orders and is therefore included twice. The timeframe covered by the dataset ranges from Standing Orders passed on r min_date to those passed on r max_date.

All in all, the corpus encompasses r format(sum(isor$wds_raw_rel), big.mark=",") words of relevant content structured in r format(sum(isor$lns_rel), big.mark=",") sub-paragraphs and r dim(isor)[1] documents - that is 7 times the length of the first five books of G.R.R. Martin's A Song of Ice and Fire. Within our sample, sub-paragraphs were changed r format(sum(isor$lns_chg, na.rm=TRUE), big.mark=",") times, amounting to r format(sum(isor$wds_chg, na.rm=TRUE), big.mark=",") changed words - or 1.5 times the length of J.R.R. Tolkien's Lord of the Rings.

# Per-country overview table, one row per entry of `countries`:
#   N SO        number of documents (rows of `isor`) of the country
#   min y/max y year of the earliest / latest t_date
#   chg         sum of wds_chg
#   min w/max w smallest / largest wds_clean_rel
#   wdns        largest wdns
#   persist     (max y - min y) / N SO, rounded to two digits
#   chg per y   chg / (max y - min y), rounded to whole words
#
# The row indices of every country are looked up once and reused for all
# columns, instead of rebuilding the mask for each statistic and creating
# columns inside a loop through recycling of a single value.
country_rows <- lapply(
  seq_along(countries),
  function(i) which(isor$country == countries[i])
)

# Apply `fun` to the values of `column` for each country; the statistic is
# required to be a single number (assumes the summarised columns are numeric).
by_country <- function(column, fun, ...) {
  vapply(country_rows, function(rows) fun(column[rows], ...), numeric(1))
}

# Year ("%Y") of the date picked by `fun` (min or max) for each country.
year_by_country <- function(fun) {
  vapply(
    country_rows,
    function(rows) format(fun(isor$t_date[rows]), "%Y"),
    character(1)
  )
}

df <- data.frame("country" = countries)
df$`N SO`  <- as.numeric(vapply(country_rows, length, integer(1)))
df$`min y` <- year_by_country(min)
df$`max y` <- year_by_country(max)
df$chg     <- by_country(isor$wds_chg, sum, na.rm = TRUE)
df$`min w` <- by_country(isor$wds_clean_rel, min, na.rm = TRUE)
df$`max w` <- by_country(isor$wds_clean_rel, max, na.rm = TRUE)
df$wdns    <- by_country(isor$wdns, max, na.rm = TRUE)

# Number of years between the first and the last document of a country.
years_covered <- as.numeric(df$`max y`) - as.numeric(df$`min y`)

df$`persist`   <- round(years_covered / df$`N SO`, 2)
# `chg` is already numeric, so no string clean-up is needed before dividing.
df$`chg per y` <- round(df$chg / years_covered)

# df <- df[order(-df$`max w`),]
rownames(df) <- df$country

# The country names serve as row names, so the first column is not printed.
knitr::kable(df[, -1])

The above table gives a general overview across all countries within the dataset. Both the differences in length (measured as relevant words, columns: 'min w' and 'max w') and the differences in the amount of change (measured as relevant words changed, column: 'chg') are striking. The British SO are more than four times as long as e.g. the Norwegian ones, and most change within the dataset happened in Great Britain, amounting to nearly 80000 words changed in over 60 years compared to Denmark with only approximately 25000 words changed in the same time period. Nonetheless, all SO are changed considerably. While an SO is usually changed more than once per legislative term (column: 'persist'), Austria and the UK set themselves apart: Austria by having SO changes occur only once every 5 years, the UK by having more than two SO per year.

# One rank plot per overview variable, each with a fixed y range; all four
# calls pass the same `size` value to rankplot().
rank_size <- 1.7

rankplot(df$`N SO`, df$country, "Number of SO", size = rank_size) +
  ylim(0, 150)

rankplot(df$chg, df$country, "changed words", size = rank_size) +
  ylim(0, 80000)

rankplot(df$persist, df$country, "SO persistence in years", size = rank_size) +
  ylim(0, 5)

rankplot(df$`chg per y`, df$country, "changed words per year", size = rank_size) +
  ylim(0, 1600)

The graphics above give a brief overview of four basic variables in the data set. They show the value as well as the rank for the number of SO in the data set, the amount of change that occurred to those SO, how many years it takes on average until an SO is replaced by a new version, and the average number of words that are changed per year.

First of all, all countries under consideration change their SO frequently and to a substantial extent. On average SO are changed every r round(mean(df$persist), 1) years with a change rate of approximately r round(mean(df$'chg per y')/10)*10 words per year. Within the variables there is considerable variance as well. The UK and Sweden are the most frequent changers with more than one version per year, while Austria's SO have an exceptionally high lifespan of over 4.5 years - in the UK all matters seem to require a change of SO, even the creation or dissolving of a single committee, while in Austria changing the SO requires a super majority. In regard to the amount changed, especially Portugal and the UK seem to play a special role. In addition to being a hyper-frequent changer, the UK also manages to change much more than other countries. One reason for this is that in the UK all rules for the Grand Committees for Northern Ireland, Scotland and Wales are spelled out explicitly and redundantly. Portugal, on the other hand, has had very few changes (13) since 1976, with one reform being extremely extensive, so that over time the Portuguese change rate might move closer to that of the other countries.

\pagebreak{}

Reforms per country

In the following, the length, the topical distribution, as well as the amount and the type of change are plotted country by country.

  # Stand-alone legend panel (`p_legend`): a column of six tiles for the corpus
  # topics next to a column of three tiles for the change types. Each tile
  # carries a rotated text label; axes, border and margins are removed.
  t_size <- 2  # text size of the tile labels

  # Topic tiles: six equally high rectangles stacked from 0 to 1.
  # `fill` has to use the keys of `mega_scale`; the first key there is "misc."
  # (with the trailing dot). With "misc" the tile matched no key and was drawn
  # in the scale's NA colour.
  legend_data <-
    data.frame(
      xmin = 0.25, xmax = 0.5,
      ymin = seq(0, 1, 1/6)[1:6], ymax = seq(0, 1, 1/6)[2:7],
      xmean = 0.375, ymean = 1/12 + seq(0, 1, 1/6)[1:6],
      label = as.factor(c("misc","laws", "special","elections", "control","pulbic")),
      fill = c("misc.","laws", "special","elections", "control","pulbic")
    )

  # Change-type tiles: three equally high rectangles stacked from 0 to 1.
  legend_data2 <-
    data.frame(
      xmin = 0.55, xmax = 0.8,
      ymin = seq(0, 1, 1/3)[1:3], ymax = seq(0, 1, 1/3)[2:4],
      xmean = 0.675, ymean = 1/6 + seq(0, 1, 1/3)[1:3],
      label = as.factor(c("ins","mdf", "del")),  fill = c("ins","mdf", "del")
    )

  # Columns are referenced by bare name inside aes(); the layers' `data`
  # argument already supplies them, so `legend_data$...` is not needed.
  p_legend <-
    ggplot() +
    # corpus legend
    geom_rect(
      data = legend_data,
      aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill)
    ) +
    geom_text(
      data = legend_data,
      aes(x = xmean,
          y = ymean,
          label = c("misc","laws", "spec.","elec.", "cntrl","public")),
      size = t_size, hjust = 0.5, vjust = 0.5, angle = 90
    ) +
    # change-type legend
    geom_rect(
      data = legend_data2,
      aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill)
    ) +
    geom_text(
      data = legend_data2,
      aes(x = xmean,
          y = ymean,
          label = c("insertion","modification","deletion")),
      size = t_size, hjust = 0.5, vjust = 0.5, angle = 90
    ) +
    mega_scale +
    # NOTE(review): newer ggplot2 releases name `panel.margin` `panel.spacing`;
    # the old name is kept here so the installed version keeps working -- confirm.
    theme(
      legend.position = "none",
      axis.title   = element_blank(),
      axis.text    = element_blank(),
      axis.ticks   = element_blank(),
      axis.line    = element_blank(),
      panel.border = element_blank(),
      panel.margin = unit(0, "lines"),
      plot.margin  = unit(c(0, 0, 0, 0), "cm")
    )

# Free-text descriptions of individual countries, keyed by country name.
desc <- list(
  Austria = "
Austria is a clears outlier in regard to the frequency with which is makes changes to its SO -- approximatly five years lie between changes. 
"
)

# Draw one figure per series identifier (`ctr`): stacked areas for the
# topic-wise corpus length over time, overlaid with yearly bars for inserted,
# modified and deleted words, placed next to the shared legend panel `p_legend`.
P <- list()
CTR <- unique(isor$ctr)
COUNTRY <- unique(isor$country)
for (i in seq_along(CTR)) {
  # Country name shown in the plot, looked up from the rows of this series.
  # Indexing unique(isor$country) by the position within unique(isor$ctr) is
  # only correct when both vectors have the same length and order.
  series_label <- as.character(isor$country[which(isor$ctr == CTR[i])][1])

  # Changed words in long format: one row per document and change type.
  changes <-
    isor %>%
    select(t_id, ctr, t_date, wds_mdf, wds_ins, wds_del) %>%
    filter(ctr == CTR[i]) %>%
    gather(change, length, wds_ins, wds_mdf, wds_del)
  # The first run of digits in the text id is used as the year; all changes of
  # a year are drawn as one bar positioned at June 1st of that year.
  changes$year <- as.numeric(str_extract(changes$t_id, "\\d+"))
  changes$t_date <- as.POSIXct(paste0(changes$year, "-06-01"))
  # na.rm = TRUE: a document without change counts must not wipe out the
  # counts of the other documents of the same year (a year with only missing
  # counts still sums to 0).
  changes <- aggregate(
    changes$length,
    by = list(changes$t_date, changes$change),
    sum,
    na.rm = TRUE
  )
  names(changes) <- c("t_date", "change", "length")

  # Corpus length per topic in long format, ordered by date and topic.
  corpus <-
    isor %>%
    select(
      t_id, ctr, t_date,
      wds_corp_top_66, wds_corp_top_1, wds_corp_top_2,
      wds_corp_top_3, wds_corp_top_4, wds_corp_top_5
    ) %>%
    filter(ctr == CTR[i]) %>%
    gather(
      corpus, length,
      wds_corp_top_66, wds_corp_top_1, wds_corp_top_2,
      wds_corp_top_3, wds_corp_top_4, wds_corp_top_5
    ) %>%
    arrange(t_date, corpus)

  p <-
    ggplot() +
    geom_area(
      data = corpus,
      aes(x = t_date, y = length, group = corpus, fill = corpus)
    ) +
    geom_bar(
      data = changes,
      aes(x = t_date, y = length, group = change, fill = change),
      stat = "identity"
    ) +
    annotate(
      "text", x = as.POSIXct("1947-01-01"), y = Inf, label = series_label,
      hjust = 0, vjust = 2, size = 4, color = "#3180AE"
    )

  # Label positions for the topic areas at the latest date: the vertical
  # centre of every stacked segment; segments below 100 words get no label.
  # drop = FALSE keeps a one-column table even when `corpus` is a plain
  # data.frame, so the `$` accesses below work.
  # NOTE(review): `corpus_label` is computed but never added to the plot, and
  # the label order may not match the row order produced by arrange() --
  # confirm before drawing it.
  corpus_label <-
    corpus[max(corpus$t_date) == corpus$t_date, "length", drop = FALSE]
  corpus_label$y <- c(0, cumsum(corpus_label$length)[-nrow(corpus_label)]) +
    corpus_label$length / 2
  corpus_label$x <- as.POSIXct("2012-01-01")
  corpus_label$label <- c("misc.","laws", "spec.","elec.", "control","pulbic")
  corpus_label$label[corpus_label$length < 100] <- ""

  p <-
    p + mega_scale +
    # "none" replaces the deprecated `fill = FALSE` and hides the fill legend.
    guides(fill = "none") +
    theme_stripped +
    xlim_time + #ylim(0,45000) +
    geom_hline(yintercept = 0, color = "grey") +
    theme(
      #axis.text.x= element_text(size=5),
      axis.text.y = element_text(angle = 60)
    )

  # Plot and legend side by side, with a 16:1 width ratio.
  grid.arrange(p, p_legend, ncol = 2, widths = c(16, 1))
}

\pagebreak{}

dutiestodo

# Print a separator and the target folder, then copy the rendered PDF into the
# shared project folder, replacing an existing copy.
cat("==========================================================================")
cat("file copied to: \nZ:/Gesch\xe4ftsordnungen/Papiere und Konferenzen/2015_corpus_coding/in_progress/")
file.copy(
  from = "C:/Dropbox/RPackages/idep/inst/reports/in_progress.pdf",
  to = "Z:/Gesch\xe4ftsordnungen/Papiere und Konferenzen/2015_corpus_coding/in_progress/in_progress.pdf",
  overwrite = TRUE
)