In xmarquez/vdem: Contains the Varieties of Democracy (V-Dem) Dataset v. 11.1

# Document-wide knitr chunk defaults: hide the code, keep messages, drop
# warnings, and cache chunks while asking knitr to re-evaluate them on
# every render (cache.rebuild).
knitr::opts_chunk$set(
  echo = FALSE,
  message = TRUE,
  warning = FALSE,
  cache = TRUE,
  cache.rebuild = TRUE
)

library(tidyverse)
# Relative directory from which the V-Dem data file and the codebook PDF are
# read below. File names are appended with paste0(), so the trailing slash
# is required.
path <- "./Country_Year_V-Dem_Full+others_R_v11.1/"

Vdem data

First we read the VDem data, version 11.1, plus the external variables, and run it through country_year_coder. There are some problems with the historical country names.

# Read the V-Dem v11.1 "Full+Others" file, add the extra country-coding
# columns listed in include_in_output via
# democracyData::country_year_coder(), and drop the unwanted matches for
# Serbia in 1804-1829.
VDem_plus <- paste0(path, "V-Dem-CY-Full+Others-v11.1.rds") %>%
  read_rds() %>%
  as_tibble() %>%
  democracyData::country_year_coder(
    country_name,
    year,
    include_in_output = c(
      "extended_country_name",
      "GWn", "cown",
      "GW_startdate",
      "GW_enddate",
      "GWc",
      "extended_region",
      "extended_continent",
      "microstate", "lat",
      "lon", "in_GW_system"
    ),
    match_type = "country",
    verbose = TRUE
  ) %>%
  # Discard 1804-1829 "Serbia" rows that were matched either to the entry
  # whose GW_startdate is 2006-06-05 or to "Yugoslavia".
  filter(
    !(country_name == "Serbia" &
        year %in% 1804:1829 &
        (GW_startdate == lubridate::ymd("2006-06-05") |
           extended_country_name == "Yugoslavia"))
  )

Here's a description of all the variables.

library(skimr)

# Summarise every column of VDem_plus and render the summary as a table.
knitr::kable(skim(VDem_plus))

And here are some tests of some typically problematic countries.

# Put the identifying columns first, then prefix the identifiers that came
# with the V-Dem file with "vdem_" (COWcode becomes vdem_cown).
VDem_plus <- rename(
  select(
    VDem_plus,
    country_name:year, extended_country_name, GWn, everything()
  ),
  vdem_country_name = country_name,
  vdem_country_id = country_id,
  vdem_country_text_id = country_text_id,
  vdem_cown = COWcode
)

# First and last year observed for each combination of the identifier
# columns passed in `...`.
year_span_by <- function(data, ...) {
  data %>%
    group_by(...) %>%
    summarise(min(year), max(year))
}

# Rows whose coded or original COW number is 255, 260 or 265
# (presumably the German states -- confirm against the COW code list).
VDem_plus %>%
  filter(cown %in% c(255, 260, 265) | vdem_cown %in% c(255, 260, 265)) %>%
  year_span_by(vdem_country_name, GWn, cown, vdem_cown) %>%
  knitr::kable()

# Rows whose coded or original COW number falls in 815-818.
VDem_plus %>%
  filter(cown %in% 815:818 | vdem_cown %in% 815:818) %>%
  year_span_by(vdem_country_name, GWn, cown, vdem_cown,
               extended_country_name) %>%
  knitr::kable()

# Rows whose coded or original COW number falls in 342-347.
VDem_plus %>%
  filter(cown %in% 342:347 | vdem_cown %in% 342:347) %>%
  year_span_by(vdem_country_name, GWn, cown, vdem_cown,
               extended_country_name) %>%
  knitr::kable()

# GWn codes that appear under more than one identifier combination.
VDem_plus %>%
  year_span_by(vdem_country_name, GWn, cown, vdem_cown,
               extended_country_name) %>%
  group_by(GWn) %>%
  filter(n() > 1) %>%
  knitr::kable()

# Store VDem_plus as package data, replacing any previously saved copy.
usethis::use_data(VDem_plus, overwrite = TRUE)

VDem codebook

Then we read the codebook and process it:

library(pdftools)

# Extract the text of the codebook PDF; later code indexes the result by
# page.
codebook_pdf <- paste0(path, "V-Dem Codebook v11.1.pdf")
vdem_codebook_raw <- pdf_text(codebook_pdf)

We extract the main numbers from the TOC.

# Elements 5-25 of the PDF text are treated as the table of contents.
toc <- vdem_codebook_raw[5:25]

# A TOC entry: a section number, a label, one to three parenthesised type
# tags, and a final parenthesised list of variable names (which may wrap
# onto the next line).
toc_entry_regex <- regex(
  "[0-9\\.]+.+(\\([A-Z\\*\\(\\)]{1,2}\\))( )?(\\([A-Z\\*]{1,2}\\))?( )?(\\([A-Z\\*]{1,2}\\))?[\\s\\r\\n]*\\([\\w\\*, \\r\\n/]{2,}\\)",
  multiline = TRUE
)

toc_entries <- unlist(str_extract_all(toc, toc_entry_regex))
toc_entries <- str_trim(toc_entries)
# Undo line wraps and runs of spaces inside an entry.
toc_entries <- str_replace_all(toc_entries, "\\r\\n", "")
toc_entries <- str_replace_all(toc_entries, "[ ]{2,}", " ")
# Normalise the nested tag "(A(C))" to two separate tags.
headers_from_toc <- str_replace(toc_entries, "\\(A\\(C\\)\\)", "(A)(C)")

# headers_from_toc

# Keep the leading section number of every entry that contains a
# parenthesis opening with "v" or "e"; other entries become NA.
number <- headers_from_toc %>%
  str_extract("^[0-9]+(\\.).+\\(([ve])") %>%
  str_extract("^[0-9\\.]+")
# Tests

# Reference list of headers used by the commented-out comparisons below.
check_headers <- readxl::read_excel("vdem_headers.xlsx", col_names = FALSE)

check_number <- str_extract(
  check_headers$...1,
  "^[0-9]+(\\.)[0-9]+(\\.)?[0-9]*"
)

# check_number[!(check_number %in% number)]
# 
# number[!(number %in% check_number)]

And then we extract the variable names and labels, and remove the page numbers from the extracted pages. Here are two examples (the first and the last extracted page), to check that the page numbers have been removed:

# Type tags of each TOC entry: up to three parenthesised groups of one or
# two capital letters or asterisks, e.g. "(A)", "(C)" or "(A)(C)".
type_var <- headers_from_toc %>%
  str_extract("(\\([A-Z\\*]{1,2}\\))( )?(\\([A-Z\\*]{1,2}\\))?( )?(\\([A-Z\\*]{1,2}\\))?") %>%
  str_trim()

# which(is.na(type_var))

# Drop TOC entries without a type tag. `number` was derived element-wise
# from the unfiltered headers, so it is subset with the same mask;
# otherwise it would fall out of step with the labels, names and types
# that are later combined into one tibble.
has_type <- !is.na(type_var)

headers_from_toc <- headers_from_toc[has_type]
number <- number[has_type]
type_var <- type_var[has_type]

# Label: the text between the section number and the first remaining
# opening parenthesis.
labels <- headers_from_toc %>%
  str_replace("^[0-9\\.]+", "") %>%
  str_extract("[[:print:]-[\\(\\)] ]+(?=\\()") %>%
  str_trim()

# which(is.na(labels) | labels == "")

# Variable names: the final parenthesised group, with the codebook's
# shorthand ("*_osp", "*_ord", "_3C /_4C", "_4C /_5C") expanded into full
# comma-separated names.
var_names <- str_extract(headers_from_toc, "\\([\\w\\*/, ]+\\)$") %>%
  str_replace_all("\\(", "") %>%
  str_replace_all("\\)", "") %>%
  str_replace_all("([\\w]+)(,)?( )?\\*_osp", "\\1, \\1_osp") %>%
  str_replace_all("([\\w]+)_osp, \\*_ord", "\\1_osp, \\1_ord") %>%
  str_replace_all("([\\w]+)_3C /_4C", "\\1_3C, \\1_4C") %>%
  str_replace_all("([\\w]+)_4C /_5C", "\\1_4C, \\1_5C")

## Get rid of page prematter and page numbers

vdem_codebook_raw_nopage <- vdem_codebook_raw[c(42:418)]

# One footer pattern per page: "TOC <n>" at the very end of the page, where
# the i-th retained page is expected to carry the number i + 40.
page_footers <- paste0(
  "\\r\\nTOC[ ]+",
  as.character(seq_along(vdem_codebook_raw_nopage) + 40),
  "\\r\\n$"
)

vdem_codebook_raw_nopage <- str_replace(
  vdem_codebook_raw_nopage,
  page_footers,
  ""
)


knitr::kable(vdem_codebook_raw_nopage[1], col.names = "Page 41")

knitr::kable(vdem_codebook_raw_nopage[418-41], col.names = "Page 418")

We then collapse the codebook into a single string, and split it along variable numbers.

# Join all retained pages into one string, separated by CRLF.
collapsed_codebook <- paste(vdem_codebook_raw_nopage, collapse = "\r\n")

# Escape the dots so that every variable number is matched literally.
number_pattern <- str_replace_all(number, "\\.", "\\\\.")

# Split wherever a line break is followed by one of the variable numbers.
entry_start <- regex(paste0("\\r\\n", number_pattern, collapse = "|"))
codebook_pieces <- str_split(collapsed_codebook, entry_start)[[1]]

# split_vdem <- str_split(collapsed_codebook,
#                         regex(paste0(str_c(number, ".+", labels), collapse = "|"), 
#                               ignore_case = TRUE), simplify = TRUE)[1,] 

# Discard the text before the first split point and collapse all
# whitespace in the remaining entries.
split_vdem <- str_squish(codebook_pieces[-1])

# Entries that still contain something that looks like a variable number.
missed <- split_vdem[
  str_detect(split_vdem, regex(paste0(number_pattern, collapse = "|")))
]

# Text that marks the end of a header field; everything from the first
# match onwards is replaced by a full stop.
header_field_stop <- "(\\.)?Compiler.+|(\\.)?Additional versions.+|(\\.)?Available versions.+|(\\.)?Question.+|(\\.)?Clarification.+|(\\.)?Scale.+|(\\.)?Subset.+|This section.+|The following.+|This set of questions.+|In this section.+|A .+|Two types of media.+|Among national.+"

# Pull the header field introduced by `label` (a regex, without the colon)
# out of every codebook entry: extract it, strip the label, tidy spaces,
# cut at the next field, and drop a dangling " .".
extract_header_field <- function(label) {
  split_vdem %>%
    str_replace_all("\\r\\n", " ") %>%
    str_extract(paste0(label, ":[\\s\\w\\d[:punct:]]+")) %>%
    str_replace(paste0(label, ":( )?"), "") %>%
    str_replace_all("[ ]{2,}", " ") %>%
    str_replace(header_field_stop, "\\.") %>%
    str_replace(" \\.", "")
}

project_managers <- extract_header_field("Project (M|m)anager(\\(s\\))?")

additional_versions <- extract_header_field("Additional (V|v)ersions")

available_versions <- extract_header_field("Available (V|v)ersions")

# test
# na.omit(project_managers)
# na.omit(additional_versions)

Next, we extract all the fields in the codebook, and run some tests.

# Start of any codebook field. Once a field's own label has been stripped,
# everything from the first match of this pattern onwards belongs to a
# later field and is removed.
# NOTE(review): this pattern lists "Answer type(s)?:" (with a space) while
# answer_type below is extracted with "Answer-type:" (with a hyphen).
# Confirm which spelling the codebook uses; left unchanged here.
extraction_pattern <- "Question(s)?:.+|Clarification(s)?:.+|Aggregation(s)?:.+|Response(\\(s\\))?:.+|Responses:.+|Source(\\(s\\))?:.+|Scale(s)?:.+|Source(s)?:.+|Note(s)?:.+|Answer type(s)?:.+|Data release(s)?:.+|Citation(s)?:.+|(CCP )?[O|o]rdering(s)?:.+|Cross-coder aggregation(s)?:.+|Coverage:.+|Years:.+|Cleaning:.+|Date specific:.+"

# Character classes describing what a field's content may consist of.
text_body <- "[\\s\\w\\d[:punct:]]+"
symbol_body <- "[\\s\\w\\d[:punct:]=+\\(\\)\\*/]+"

# Extract one field from every entry of split_vdem.
#
# label        regex for the field label, without the trailing colon; it is
#              used both to find the field and to strip the label afterwards
# body         regex for the field content that follows "label:"
# stop_pattern regex whose first match (and everything after it) is replaced
# replacement  text put in place of the stop_pattern match
# multiline    passed on to stringr::regex() for the extraction step
#
# Returns a character vector as long as split_vdem, NA where the field is
# absent.
extract_codebook_field <- function(label,
                                   body,
                                   stop_pattern = extraction_pattern,
                                   replacement = "",
                                   multiline = FALSE) {
  split_vdem %>%
    str_replace_all("\\r\\n", " ") %>%
    str_extract(regex(paste0(label, ":", body), multiline = multiline)) %>%
    str_replace(paste0(label, ":( )?"), "") %>%
    str_replace_all("[ ]{2,}", " ") %>%
    str_replace(stop_pattern, replacement) %>%
    str_trim()
}

questions <- extract_codebook_field(
  "Question(s)?", text_body,
  multiline = TRUE
)

# Unlike the other fields, a truncated clarification ends with a full stop.
clarifications <- extract_codebook_field(
  "Clarification(s)?", text_body,
  replacement = "\\.",
  multiline = TRUE
)

aggregation <- extract_codebook_field(
  "Aggregation(s)?",
  "[\\s\\w\\d[:punct:]=\\+\\(\\)\\*/\\^\\∗]+.+"
)

responses <- extract_codebook_field("Response(s)?", symbol_body)

sources <- extract_codebook_field("Source(\\(s\\))?", symbol_body)

scale <- extract_codebook_field("Scale(s)?", symbol_body)

data_release <- extract_codebook_field(
  "Data release(s)?", symbol_body,
  stop_pattern = paste0(extraction_pattern, "|Citation:")
)

# Citations are the last field of an entry, so text from the following
# section heading also has to be cut off.
citation <- extract_codebook_field(
  "Citation(s)?", symbol_body,
  stop_pattern = paste0(extraction_pattern, "|[0-9]+(\\.)[0-9]+(\\.)?[0-9]*.+|2 Mid-Level Democracy.+|V\\-Dem Indicators 3 Elections.+|[0-9][0-9]? [A-Z].+|Part III.+")
)

# The label is matched in either case for both extraction and stripping.
# Previously only a capitalised "Ordering:" prefix was stripped, so a
# lower-case match (e.g. from "CCP ordering:") kept its label, was then
# removed entirely by extraction_pattern, and ended up as NA.
ordering <- extract_codebook_field("[Oo]rdering(s)?", symbol_body)

# An empty field carries no information; %in% keeps NA entries untouched.
ordering[ordering %in% ""] <- NA

cross_coder <- extract_codebook_field(
  "Cross-coder aggregation(s)?", symbol_body
)

answer_type <- extract_codebook_field("Answer-type", symbol_body)

notes <- extract_codebook_field("Notes", symbol_body)

cleaning <- extract_codebook_field("Cleaning", symbol_body)

# Only digits, whitespace and punctuation, ending in a four-digit year.
years <- extract_codebook_field("Years", "[\\s\\d[:punct:]]+[\\d]{4}")

For example, here's what some of the fields look like:

# One-column table of the distinct non-missing values of an extracted
# field, with `header` as the column title.
unique_values_table <- function(values, header) {
  distinct_values <- unique(na.omit(values))
  knitr::kable(as_tibble(distinct_values), col.names = header)
}

unique_values_table(sources, "Sources")

unique_values_table(responses, "Responses")

unique_values_table(citation, "Citation")

unique_values_table(ordering, "Ordering")

unique_values_table(cross_coder, "Cross-coder")

unique_values_table(scale, "Scale")

unique_values_table(data_release, "Data release")

# Assemble the codebook table: one row per TOC entry, one column per
# extracted field. All inputs must have the same length.
vdem_codebook <- tibble(
  # Identification, taken from the table of contents
  number = number,
  name = var_names,
  label = labels,
  type = type_var,
  # Header fields
  project_manager = project_managers,
  additional_versions = additional_versions,
  available_versions = available_versions,
  # Body fields
  question = questions,
  clarification = clarifications,
  responses = responses,
  answer_type = answer_type,
  scale = scale,
  ordering = ordering,
  aggregation = aggregation,
  cross_coder = cross_coder,
  data_release = data_release,
  source = sources,
  cleaning = cleaning,
  citation = citation,
  years = years,
  note = notes
)

Finally, we do some manual fixes.

# Manual fixes

# Canonical wording for the cross-coder aggregation note, and the variants
# that are mapped onto it.
irt_label <- "Bayesian item response theory measurement model (see V-Dem Methodology)"
irt_variants <- c(
  "Bayesian item response theory measurement model (see V-Dem Methodology).",
  "Bayesian item response theory measurement model (see V-Dem Methodology",
  "Bayesian item response theory measurement model (see V-Dem Methodology, posted at V-Dem.net).",
  "Bayesian item response theory measurement model (see V-Dem Methodology,)."
)

vdem_codebook <- vdem_codebook %>%
  # Remove every "- " from the free-text fields.
  mutate(across(
    c(question, clarification, responses, aggregation, cross_coder, note),
    ~ str_replace_all(.x, "- ", "")
  )) %>%
  mutate(
    # Drop a trailing full stop from the scale.
    scale = str_replace_all(scale, "\\.$", ""),
    # Close up names that were split after an underscore.
    name = str_replace_all(name, "_ ", "_"),
    cross_coder = plyr::mapvalues(
      cross_coder,
      from = irt_variants,
      to = rep(irt_label, 4)
    )
  ) %>%
  # Section: the leading integer of the variable number.
  mutate(
    section = as.integer(
      str_replace(str_extract(number, "^[0-9]+"), "\\.", "")
    )
  ) %>%
  select(section, everything())

# Summary table of the finished codebook.
knitr::kable(skimr::skim(vdem_codebook))

# Store the codebook as package data, replacing any previous copy.
usethis::use_data(vdem_codebook, overwrite = TRUE)

codebook_names <- vdem_codebook$name
data_columns <- names(VDem_plus)

# Codebook names that are also columns of the data
codebook_names[codebook_names %in% data_columns]

# Codebook names with no identically named column in the data
codebook_names[!codebook_names %in% data_columns]

# Data columns with no identically named codebook entry
data_columns[!data_columns %in% codebook_names]