```r
knitr::opts_chunk$set(
  fig.width = 8,
  # fig.height = 6,
  collapse = TRUE,
  comment = "#>"
)
```
```r
library(dbtprotokoll)
library(tidyverse)
library(xml2)
library(ggplot2)
```
We start by loading a protocol dataset, consisting of previously parsed XML protocols, from disk into our global environment.
load("dataset.RData") #setwd("../") #protocols <- parse_protocols(start = "19001-data.xml", end = "19004-data.xml")
We also remove all entries with moderator status, because these are not representative of average word usage.
```r
speaches <- filter(protocols[[2]], is.na(moderator))
```
We divide the speakers into academics and non-academics depending on whether there is an entry in "titel".
```r
academics    <- filter(protocols[[1]], !is.na(titel))
nonacademics <- filter(protocols[[1]], is.na(titel))
```
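As a quick sanity check, and to back up the claim below that most representatives have no academic title, we can compare the sizes of the two groups:

```r
# Group sizes; nonacademics is expected to be the larger group.
nrow(academics)
nrow(nonacademics)
```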
Now we want the mean word length and the distribution of word lengths for each group. Because we will need both again for our second analysis, we define two small helper functions.
The function word_length computes the length of each word by counting its letters; non-alphabetic characters such as punctuation are ignored.
```r
word_length <- function(str){
  # Split each string into words at whitespace, then count the
  # alphabetic characters in every word.
  words <- str_split(str, "[:space:]")
  len <- sapply(words, str_count, "[:alpha:]")
  return(len)
}
```
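To illustrate, applied to a single example sentence the function returns one letter count per word (the sentence is just an example):

```r
# Five words: "Sehr", "geehrte", "Damen", "und", "Herren!";
# the "!" is not alphabetic, so "Herren!" counts 6 letters.
unlist(word_length("Sehr geehrte Damen und Herren!"))
#> [1] 4 7 5 3 6
```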
Because the majority of representatives do not have an academic title, we express this distribution in percentages for better comparison.
```r
percentage <- function(tbl){
  # Count how often each word length occurs, then convert the
  # counts into shares of all words.
  group_by(tbl, lgt) %>%
    mutate("count" = n()) %>%
    distinct() %>%
    arrange(lgt) %>%
    ungroup() %>%
    mutate(pct = count / sum(count))
}
```
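A toy example shows what percentage() produces, assuming a tibble with a lgt column as input:

```r
# Four words with lengths 3, 5, 5, 9 yield counts 1, 2, 1
# and shares 0.25, 0.50, 0.25.
percentage(tibble(lgt = c(3, 5, 5, 9)))
```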
By dividing the speeches into academic and non-academic speeches, we can compute the word length distribution and the mean word length for each group.
```r
# Join speaker metadata to their speeches and keep actual speeches only.
academic_speaches <- left_join(academics, speaches, by = c("id" = "speaker_id"))
academic_speaches <- filter(academic_speaches, speech_type == "speech")
academic_word_lengths <- tibble("lgt" = unname(unlist(sapply(academic_speaches$content, word_length)))) %>%
  filter(lgt > 0)
academic_mean_word_length <- summarise(academic_word_lengths, "mean" = mean(lgt))

nonacademic_speaches <- left_join(nonacademics, speaches, by = c("id" = "speaker_id"))
nonacademic_speaches <- filter(nonacademic_speaches, speech_type == "speech")
nonacademic_word_lengths <- tibble("lgt" = unname(unlist(sapply(nonacademic_speaches$content, word_length)))) %>%
  filter(lgt > 0)
nonacademic_mean_word_length <- summarise(nonacademic_word_lengths, "mean" = mean(lgt))
```
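The resulting means can be inspected directly; the exact values depend on the protocols loaded above:

```r
# Mean word length per group (values depend on the loaded data).
academic_mean_word_length$mean
nonacademic_mean_word_length$mean
```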
Finally, we can put these data into graphs.
```r
titel_mean_word_length <- tibble(
  "titel" = c("academics", "nonacademics"),
  "count" = c(academic_mean_word_length$mean, nonacademic_mean_word_length$mean)
)
titel_word_lengths <- bind_rows(
  mutate(percentage(academic_word_lengths), "titel" = "academics"),
  mutate(percentage(nonacademic_word_lengths), "titel" = "nonacademics")
)

# Mean word length per group.
ggplot(titel_mean_word_length) +
  geom_col(aes(x = titel, y = count))

# Word length distribution per group; xlim() drops lengths above 20
# from the plot (ggplot2 warns about the removed rows).
ggplot(titel_word_lengths) +
  geom_col(aes(x = lgt, y = pct, fill = titel), position = "dodge") +
  xlim(0, 20)
```
Similar to before, we use a function to get the distribution of word lengths for speakers of a given party.
```r
party_word_length <- function(speaches, speakers, party){
  # Select the members of the given party, join their speeches,
  # and compute the length of every word they used.
  party_members <- filter(speakers, fraktion == party)
  party_speeches <- left_join(party_members, speaches, by = c("id" = "speaker_id"))
  party_speeches <- filter(party_speeches, speech_type == "speech")
  word_lengths <- tibble("lgt" = unname(unlist(sapply(party_speeches$content, word_length)))) %>%
    filter(lgt > 0)
  return(word_lengths)
}
```
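For a single party the function is used like this ("spd" is one of the party labels used below):

```r
# Word lengths for one party, then their shares per length.
spd_word_lengths <- party_word_length(speaches, protocols[[1]], "spd")
head(percentage(spd_word_lengths))
```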
We run the analysis for all six parties and compare them.
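The plots below colour the bars via party_colors, a named vector mapping party labels to colours that is not defined in this section and is assumed to exist already; a minimal sketch with hypothetical colour values:

```r
# Hypothetical colour mapping (adjust as needed); the names must
# match the party labels in the data exactly.
party_colors <- c(
  "cdu/csu"             = "#000000",
  "spd"                 = "#E3000F",
  "fdp"                 = "#FFED00",
  "bündnis90/diegrünen" = "#46962B",
  "afd"                 = "#009EE0",
  "dielinke"            = "#BE3075"
)
```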
```r
parties <- c("cdu/csu", "spd", "fdp", "bündnis90/diegrünen", "afd", "dielinke")

party_word_lengths <- lapply(parties, party_word_length,
                             speakers = protocols[[1]], speaches = speaches)

party_mean_word_length <- lapply(party_word_lengths, summarise, "mean" = mean(lgt))
party_mean_word_length <- tibble(
  "party" = parties,
  "count" = sapply(party_mean_word_length, function(tbl){ return(tbl$mean) })
)

party_word_lengths_pct <- bind_rows(lapply(seq_along(parties), function(i){
  return(mutate(percentage(party_word_lengths[[i]]), "party" = parties[i]))
}))

# Mean word length per party.
ggplot(party_mean_word_length) +
  geom_col(aes(x = party, y = count, fill = party)) +
  scale_fill_manual(values = party_colors)

# Word length distribution per party.
ggplot(party_word_lengths_pct) +
  geom_col(aes(x = lgt, y = pct, fill = party), position = "dodge") +
  xlim(0, 20) +
  scale_fill_manual(values = party_colors)
```