#####
# init
#####
# clear old data
rm(list=ls(all=TRUE))
# load packages
library("data.table")
library("sdlutils")
library("readxl")
# set working directory
(wd = "C:/gdrive/rpackages/pesdint/data-raw/government/cpi_corruption")
setwd(wd)
#####
# read data 2019
#####
path = "cpi_2019.xlsx"
col_names = c("country_name", "country_code", "region", "cpi_score", "cpi_rank", "cpi_se", "cpi_sources", "cpi_ci_low", "cpi_ci_high", "cpi_adb_cpia", "cpi_bf_sgi", "cpi_bf_bti", "cpi_eiu_cr", "cpi_fh_nit", "cpi_gi_crr", "cpi_imd_wcy", "cpi_perc_arg", "cpi_prs_icrg", "cpi_vodp", "cpi_wb_cpia", "cpi_wef_eos", "cpi_wjp_rol")
dt_19 = data.table(read_excel(path, sheet = 1, skip = 3, col_names = col_names))
dt_19[, year := 2019]
#####
# read data 2018
#####
path = "cpi_2018.xlsx"
col_names = c("country_name", "country_code", "region", "cpi_score", "cpi_rank", "cpi_se", "cpi_sources", "cpi_ci_high", "cpi_ci_low", "cpi_adb_cpia", "cpi_bf_sgi", "cpi_bf_bti", "cpi_eiu_cr", "cpi_fh_nit", "cpi_gi_crr", "cpi_imd_wcy", "cpi_perc_arg", "cpi_prs_icrg", "cpi_wb_cpia", "cpi_wef_eos", "cpi_wjp_rol", "cpi_vodp")
dt_18 = data.table(read_excel(path, sheet = 1, skip = 3, col_names = col_names))
dt_18[, year := 2018]
#####
# read data 2017
#####
path = "cpi_2017.xlsx"
list1 = excel_sheets(path)[-c(1:3)]
list2 = lapply(list1, read_excel, path = path, skip = 2)
dt_17 = rbindlist(list2, fill = TRUE)
# standard column names
colnames(dt_17) = c("country_name", "country_code", "cpi_score", "cpi_rank", "cpi_se", "cpi_ci_low", "cpi_ci_high", "cpi_sources", "cpi_wb_cpia", "cpi_wef_eos", "cpi_gi_crr", "cpi_bf_bti", "cpi_adb_cpia", "cpi_imd_wcy", "cpi_bf_sgi", "cpi_wjp_rol", "cpi_prs_icrg", "cpi_vodp", "cpi_eiu_cr", "cpi_fh_nit", "cpi_perc_arg", "dup1", "dup2", "dup3", "dup4")
# remove duplicates
dt_17 = dt_17[(!duplicated(country_name)) & (!is.na(country_name)) & !(country_name == "REGIONAL AVERAGE"), ]
dt_17[, year := 2017]
#####
# read data 2016
#####
path = "cpi_2016.xlsx"
col_names = c("country_name", "cpi_score", "cpi_rank", "region", "country_code", "cpi_wb_cpia", "cpi_wef_eos", "cpi_gi_crr", "cpi_bf_bti", "cpi_adb_cpia", "cpi_imd_wcy", "cpi_bf_sgi", "cpi_wjp_rol", "cpi_prs_icrg", "cpi_vodp", "cpi_eiu_cr", "cpi_fh_nit", "cpi_perc_arg", "dup1", "dup2", "dup3", "cpi_sources", "cpi_se", "cpi_ci_low", "cpi_ci_high", "cpi_min", "cpi_max", "oecd", "g20", "brics", "eu", "arab", "n/a")
dt_16 = data.table(read_excel(path, sheet = 1, skip = 1, col_names = col_names))
dt_16[, year := 2016]
#####
# read data 2015
#####
path = "cpi_2015.xlsx"
col_names = c("cpi_rank", "country_name", "country_code", "region", "cpi_score", "cpi_sources", "cpi_se", "cpi_min", "cpi_max", "cpi_ci_low", "cpi_ci_high", "cpi_wb_cpia", "cpi_wef_eos", "cpi_bf_bti", "cpi_adb_cpia", "cpi_imd_wcy", "cpi_bf_sgi", "cpi_wjp_rol", "cpi_prs_icrg", "cpi_eiu_cr", "cpi_ihs_gi", "cpi_perc_arg", "cpi_fh_nit")
dt_15 = data.table(read_excel(path, sheet = 1, skip = 1, col_names = col_names))
dt_15[, year := 2015]
#####
# 2014
#####
path = "cpi_2014.xlsx"
col_names = c("cpi_rank", "country_name", "country_code", "region", "cpi_score", "cpi_sources", "cpi_se", "cpi_min", "cpi_max", "cpi_ci_low", "cpi_ci_high", "cpi_adb_cpia", "cpi_bf_sgi", "cpi_bf_bti", "cpi_imd_wcy", "cpi_prs_icrg", "cpi_wb_cpia", "cpi_wef_eos", "cpi_wjp_rol", "cpi_eiu_cr", "cpi_ihs_gi", "cpi_perc_arg", "cpi_fh_nit")
dt_14 = data.table(read_excel(path, sheet = 1, skip = 1, col_names = col_names))
dt_14[, year := 2014]
#####
# 2013
#####
path = "cpi_2013.xlsx"
col_names = c("cpi_rank", "country_name", "country_code", "ifs_code", "region", "dup1", "cpi_score", "cpi_sources", "cpi_se", "cpi_ci_low", "cpi_ci_high", "cpi_min", "cpi_max", "cpi_adb_cpia", "cpi_bf_sgi", "cpi_bf_bti", "cpi_imd_wcy", "cpi_prs_icrg", "cpi_wb_cpia", "cpi_wef_eos", "cpi_wjp_rol", "cpi_eiu_cr", "cpi_ihs_gi", "cpi_perc_arg", "cpi_ti_bps", "cpi_fh_nit")
dt_13 = data.table(read_excel(path, sheet = 1, skip = 3, col_names = col_names))
dt_13[, year := 2013]
#####
# 2012
#####
path = "cpi_2012.xlsx"
col_names = c("cpi_rank", "country_name", "region", "cpi_score", "dup1", "cpi_sources", "cpi_se", "cpi_ci_low", "cpi_ci_high", "cpi_min", "cpi_max", "cpi_adb_gr", "cpi_bf_sgi", "cpi_bf_bti", "cpi_imd_wcy", "cpi_prs_icrg", "cpi_wb_cpia", "cpi_wef_eos", "cpi_wjp_rol", "cpi_eiu_cr", "cpi_ihs_gi", "cpi_perc_arg", "cpi_ti_bps", "cpi_fh_nit")
dt_12 = data.table(read_excel(path, sheet = 1, skip = 2, col_names = col_names))
dt_12[, year := 2012]
#####
# 2011
#####
path = "cpi_2011.xlsx"
col_names = c("cpi_rank", "country_name", "cpi_score", "dup1", "cpi_sources", "cpi_se", "cpi_min", "cpi_max", "cpi_ci_low", "cpi_ci_high", "cpi_adb_gr", "cpi_adb_cpia", "cpi_bf_sgi", "cpi_bf_bti", "cpi_eiu_cr", "cpi_fh_nit", "cpi_ihs_gi", "cpi_imd_wcy_2010", "cpi_imd_wcy", "cpi_perc_arg_2010", "cpi_perc_arg", "cpi_prs_icrg", "cpi_ti_bps", "cpi_wb_cpia", "cpi_wef_eos_2010", "cpi_wef_eos", "cpi_wjp_rol")
dt_11 = data.table(read_excel(path, sheet = 1, skip = 2, col_names = col_names))
dt_11[, year := 2011]
#####
# 2010
#####
path = "cpi_2010.xlsx"
col_names = c("cpi_rank", "country_name", "cpi_score", "cpi_sources", "cpi_se", "cpi_min", "cpi_max", "cpi_ci_low", "cpi_ci_high", "cpi_adb_cpia", "cpi_adb_gr", "cpi_bf_bti", "cpi_eiu_cr", "cpi_fh_nit", "cpi_ihs_gi", "cpi_imd_wcy_2009", "cpi_imd_wcy", "cpi_perc_arg_2009", "cpi_perc_arg", "cpi_wb_cpia", "cpi_wef_eos_2009", "cpi_wef_eos")
dt_10 = data.table(read_excel(path, sheet = 1, skip = 4, col_names = col_names))
dt_10[, year := 2010]
#####
# 2009-1997
#####
download_fread_csv = function(year) {
path = paste0("cpi_", year, ".csv")
dt = fread(path)
dt[, year := year]
# standard column names
colnames(dt) = cols_97_09[1:dim(dt)[2]]
# return
return(dt)
}
cols_97_09 = c("country_name", "country_code", "region", "cpi_score", "cpi_rank", "interval")
list_09 = lapply(1997:2009, download_fread_csv)
#####
# 1980-85, 1988-92, 1993-96
#####
# manually exported to spreadsheet pages 5-6 with Adobe Acrobat DC: https://www.transparency.org/files/content/tool/1996_CPI_EN.pdf
# manually added variances for 1993-96, 1988-92, 1980-85
# dt = data.table(read_excel("C:/Users/desk/Downloads/1996_CPI_EN.xlsx"))
# dput(dt)
dt = structure(list(country_name = c("New Zealand", "Denmark", "Sweden", "Finland", "Canada", "Norway", "Singapore", "Switzerland", "Netherlands", "Australia", "Ireland", "United Kingdom", "Germany", "Israel", "United States", "Austria", "Japan", "Hong Kong", "France", "Belgium", "Chile", "Portugal", "South Africa", "Poland", "Czech Republic", "Malaysia", "Korea", "Greece", "Taiwan", "Jordan", "Hungary", "Spain", "Turkey", "Italy", "Argentina", "Bolivia", "Thailand", "Mexico", "Ecuador", "Brazil", "Egypt", "Colombia", "Uganda", "Philippines", "Indonesia", "India", "Russia (USSR)", "Venezuela", "Cameroon", "China", "Bangladesh", "Kenya", "Pakistan", "Nigeria"),
cpi_93_96 = c(9.43, 9.33, 9.08, 9.05, 8.96, 8.87, 8.80, 8.76, 8.71, 8.60, 8.45, 8.44, 8.27, 7.71, 7.66, 7.59, 7.05, 7.01, 6.96, 6.84, 6.80, 6.53, 5.68, 5.57, 5.37, 5.32, 5.02, 5.01, 4.98, 4.89, 4.86, 4.31, 3.54, 3.42, 3.41, 3.40, 3.33, 3.30, 3.19, 2.96, 2.84, 2.73, 2.71, 2.69, 2.65, 2.63, 2.58, 2.50, 2.46, 2.43, 2.29, 2.21, 1.00, 0.69),
variance_93_96 = c(0.39, 0.44, 0.3, 0.23, 0.15, 0.2, 2.36, 0.24, 0.25, 0.48, 0.44, 0.25, 0.53, 1.41, 0.19, 0.41, 2.61, 1.79, 1.58, 1.41, 2.53, 1.17, 3.3, 3.63, 2.11, 0.13, 2.3, 3.37, 0.87, 0.17, 2.19, 2.48, 0.3, 4.78, 0.54, 0.64, 1.24, 0.22, 0.42, 1.07, 6.64, 2.41, 8.72, 0.49, 0.95, 0.12, 0.94, 0.4, 2.98, 0.52, 1.57, 3.69, 2.52, 6.37),
sources_93_96 = c(6, 6, 6, 6, 6, 6, 10, 6, 6, 6, 6, 7, 6, 5, 7, 6, 9, 9, 6, 6, 7, 6, 6, 4, 4, 9, 9, 6, 9, 4, 6, 6, 6, 6, 6, 4, 10, 7, 4, 7, 4, 6, 4, 8, 10, 9, 5, 7, 4, 9, 4, 4, 5, 4),
cpi_88_92 = c(9.30, 8.88, 8.71, 8.88, 8.97, 8.69, 9.16, 9.00, 9.03, 8.20, 7.68, 8.26, 8.13, 7.44, 7.76, 7.14, 7.25, 6.87, 7.45, 7.40, 5.51, 5.50, 7.00, 5.20, 5.20, 5.10, 3.50, 5.05, 5.14, 5.51, 5.22, 5.06, 4.05, 4.30, 5.91, 1.34, 1.85, 2.23, 3.27, 3.51, 1.75, 2.71, 3.27, 1.96, 0.57, 2.89, 3.27, 2.50, 3.43, 4.73, 0.00, 1.60, 1.90, 0.63),
sources_88_92 = c(3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 2, 3, 3, 4, 4, 3, 3, 2, 3, 3, 1, 1, 4, 4, 3, 4, 2, 2, 3, 3, 3, 2, 1, 4, 3, 2, 3, 2, 2, 1, 3, 4, 3, 1, 3, 2, 3, 1, 2, 3, 2),
cpi_80_85 = c(8.41, 8.01, 8.01, 8.14, 8.41, 8.41, 8.41, 8.41, 8.41, 8.41, 8.28, 8.01, 8.14, 7.27, 8.41, 7.35, 7.75, 7.35, 8.41, 8.28, 6.53, 4.46, 7.35, 3.64, 5.13, 6.29, 3.93, 4.20, 5.95, 5.30, 1.63, 6.82, 4.06, 4.86, 4.94, 0.67, 2.42, 1.87, 4.54, 4.67, 1.12, 3.27, 0.67, 1.04, 0.20, 3.67, 5.13, 3.19, 4.59, 5.13, 0.78, 3.27, 1.52, 0.99),
sources_80_85 = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2)),
row.names = c(NA, -54L),
class = c("data.table", "data.frame"))
cols_early = c("country_name", "cpi_score", "cpi_sources", "year")
dt_93 = rbindlist(lapply(1996:1993, function(x){dt[, c("country_name", "cpi_93_96", "sources_93_96"), with = FALSE][, year := x]}))
colnames(dt_93) = cols_early
dt_88 = rbindlist(lapply(1992:1988, function(x){dt[, c("country_name", "cpi_88_92", "sources_88_92"), with = FALSE][, year := x]}))
colnames(dt_88) = cols_early
dt_80 = rbindlist(lapply(1985:1980, function(x){dt[, c("country_name", "cpi_80_85", "sources_80_85"), with = FALSE][, year := x]}))
colnames(dt_80) = cols_early
#####
# combine data
#####
cpi = rbindlist(c(list(dt_19, dt_18, dt_17, dt_16, dt_15, dt_14, dt_13, dt_12, dt_11, dt_10), list_09, list(dt_93, dt_88, dt_80)), fill=TRUE)
#####
# clean country names
#####
# qa checks
# sort(unique(cpi[["country_name"]]))
# character conversions
country_names = cpi[["country_name"]]
country_names = gsub("&", "and", country_names)
country_names = gsub("´|’", "'", country_names)
country_names = gsub("\\.|,|\\(|\\)", "", country_names)
country_names = tolower(country_names)
country_names = gsub(" the | of | of the ", " ", country_names)
country_names = gsub(" ", " ", country_names)
country_names = gsub("^the | of$| the$", "", country_names)
# assign
cpi[, country_name := country_names]
# brunei
cpi[grep("brunei", country_name), country_name := "brunei"]
# cabo verde
cpi[grep("verde", country_name), country_name := "cabo verde"]
# cote d'ivoire
cpi[grep("voire|ivory", country_name), country_name := "cote d'ivoire"]
# czechia
cpi[grep("czech", country_name), country_name := "czechia"]
# dominican republic
cpi[grep("dominican", country_name), country_name := "dominican republic"]
# eswatini
cpi[grep("swaziland", country_name), country_name := "eswatini"]
# guinea bissau
cpi[grep("bissau", country_name), country_name := "guinea-bissau"]
# kiribati
cpi[grep("ribati", country_name), country_name := "kiribati"]
# kyrgyzstan
cpi[grep("kyrgyz", country_name), country_name := "kyrgyzstan"]
# kuwait
cpi[grep("kuw.it", country_name), country_name := "kuwait"]
# macao
cpi[grep("maca", country_name), country_name := "macao"]
# macedonia
cpi[grep("maced", country_name), country_name := "north macedonia"]
# moldova
cpi[grep("moldov", country_name), country_name := "moldova"]
# palestine
cpi[grep("palest", country_name), country_name := "palestine"]
# slovakia
cpi[grep("slovak", country_name), country_name := "slovakia"]
# vietnam
cpi[grep("viet {0,1}nam", country_name), country_name := "vietnam"]
# usa
cpi[grep("^usa$|united states", country_name), country_name := "united states"]
# qa check
# grep("brunei|verde|voire|ivory|czech|dominican|swaziland|bissau|ribati|kyrgyz|kuw.it|maca|maced|moldov|palest|slovak|viet {0,1}nam|united states", unique(cpi[["country_name"]]), value = TRUE)
# dissolution: soviet union
cpi[country_name %in% c("russia ussr", "russia"), country_name := c("soviet union", "russia")[(year > 1991) + 1], by = c("year")]
# qa check
(l = grep("russia|soviet", unique(cpi[["country_name"]]), value = TRUE))
length(l) == 2
# split: congos
cpi[country_name %in% c("congo democratic republic", "democratic republic congo"), country_name := "congo-kinshasa"]
cpi[country_name %in% c("congo", "congo republic", "republic congo", "congo brazzaville"), country_name := "congo-brazzaville"]
# qa check
(l = grep("congo", unique(cpi[["country_name"]]), value = TRUE))
length(l) == 2
# split: czechoslovakia (occurred in 1993 but CPI didn't add Slovakia until 1997)
cpi[(country_name %in% c("czechia")) & (year < 1997), country_name := c("czechoslovakia")]
(l = grep("czech|slovak", unique(cpi[["country_name"]]), value = TRUE))
length(l) == 3
# split: koreas (occurred in 1950 but CPI didn't add North until 1997)
cpi[(country_name %in% c("korea")) & (year < 1997), country_name := c("korea south")]
cpi[(country_name %in% c("south korea")), country_name := c("korea south")]
# qa check
(l = grep("korea", unique(cpi[["country_name"]]), value = TRUE))
length(l) == 2
# split: sudans
cpi[(country_name == "sudan") & (year >= 2013), country_name := "sudan north"]
(l = grep("sudan", unique(cpi[["country_name"]]), value = TRUE))
length(l) == 3
# qa checks
# View(cpi[is.na(country_name), ])
# View(cpi[(is.na(country_name)) | (country_name %in% c("Country", "Country/Territory", "REGIONAL AVERAGE")), ])
# sort(unique(cpi[["country_name"]]))
#####
# convert country names
#####
load("../../data/countries.rda")
# convert country codes
short_convert = setNames(countries[["master_short"]], countries[["cpi_country"]])
cpi[, country_code := short_convert[country_name]]
# convert country names
long_convert = setNames(countries[["master_long"]], countries[["cpi_country"]])
cpi[, country_name := long_convert[country_name]]
# qa checks
# dim(cpi[is.na(country_name2), ])
#####
# convert cpi score to numeric
#####
cpi[, cpi_score := gsub(",", ".", cpi_score)]
cpi[, cpi_score := as.numeric(cpi_score)]
#####
# minimum & maximum from interval
#####
pattern = "^(-{0,1}[[:digit:]]).([[:digit:]])[^[:digit:]]*([[:digit:]]){0,2}.([[:digit:]]){0,1}$"
cpi[!is.na(interval), `:=`(cpi_min = as.numeric(gsub(pattern, "\\1.\\2", interval)),
cpi_max = as.numeric(gsub(pattern, "\\3.\\4", interval))), by = c("country_name", "year")]
# qa check
# View(cpi[!is.na(interval) & (is.na(cpi_min) | is.na(cpi_max)), c("country_name", "country_code", "year", "cpi_score", "interval", "cpi_min", "cpi_max"), with = FALSE])
#####
# version
#####
cpi[year %in% 2012:2019, `:=`(cpi_version = 3,
cpi_weight = 1)]
cpi[year %in% 1998:2011, `:=`(cpi_version = 2,
cpi_weight = 1)]
cpi[year %in% 1980:1997, cpi_version := 1]
cpi[year %in% 1997, cpi_weight := 1]
cpi[year %in% 1993:1996, cpi_weight := 1/4]
cpi[year %in% 1988:1992, cpi_weight := 1/5]
cpi[year %in% 1980:1985, cpi_weight := 1/6]
# View(cpi[is.na(version), ])
#####
# rescale cpi scores to same scale 0-1
#####
# rescale values to 0-1
cpi[cpi_version %in% 1:2, cpi_rescaled := cpi_score / 10]
cpi[cpi_version %in% 3, cpi_rescaled := cpi_score / 100]
# error in Brunei 2010
cpi[cpi_rescaled > 1, `:=`(cpi_score = cpi_score / 10,
cpi_rescaled = cpi_rescaled / 10)]
# #####
# # link cpi scores across 2011-2012: method 1 (not used -- not close enough)
# #####
# # set both 2012 and 2011 as having same standard deviation
# cpi[year >= 2012, cpi_linked := cpi_rescaled]
# var12 = var(cpi[year == 2012, ][["cpi_rescaled"]])
# var11 = var(cpi[year == 2011, ][["cpi_rescaled"]])
# coef = sqrt(var12) / sqrt(var11)
# cpi[year <= 2011, cpi_linked := cpi_rescaled * coef]
# m12 = mean(cpi[year == 2012, ][["cpi_linked"]])
# m11 = mean(cpi[year == 2011, ][["cpi_linked"]])
# diff = m12 - m11
# cpi[year <= 2011, cpi_linked := cpi_linked + diff]
#####
# link cpi scores across 2011-2012: method 2
#####
# observed values
obsr12 = cpi[year == 2012, ][["cpi_rescaled"]]
obsr11 = cpi[year == 2011, ][["cpi_rescaled"]]
obsrle11 = cpi[year <= 2011, ][["cpi_rescaled"]]
length(obsrle11)
is.list(obsrle11)
# # start plot
# plot(density(obsr11), col="blue")
# par(new=TRUE)
# plot(density(obsr12), col="red")
# set parameters
nn = 101 # number of entries
series = 1:nn # 1 to number of entries
dens12 = density(obsr12, n = nn, from = 0, to = 1) # density for 2012
# determine usual jitter between years
jitters = numeric(0)
for (j in 2013:2017) {
obsr1 = cpi[year == j, ][["cpi_rescaled"]]
dens1 = density(obsr1, n = nn, from = 0, to = 1)
obsr2 = cpi[year == (j-1), ][["cpi_rescaled"]]
dens2 = density(obsr2, n = nn, from = 0, to = 1)
diff = dens1$y - dens2$y
jitters = c(jitters, abs(median(diff)))
}
jitter = mean(jitters)
# model fitting
set.seed(2011)
for (j in 1:25) {
dens11 = density(obsr11, n = nn, from = 0, to = 1) # density for 2011
diff = dens11$y - dens12$y # difference in densities
diff = diff + (rnorm(n = nn) * jitter * 10) # add jitter
print(c(j, abs(median(diff))))
move = rep(x = 0, times = nn) # 0*n movement vector
for (i in series) {
rel_pos = series - i
n = dnorm(x = rel_pos, mean = 0, sd = 20) # normal curve to higher-weight closer positions
s = sign(rel_pos) # positions to the left of this position should move right, etc.
d = diff[i] # negative positions pull, positive positions push
weight = n * s * d / 10
move = move + weight
}
# function to identify correct approximate weight
transfunc = Vectorize(function(x) {
x100 = floor(x * 100) + 1
x100 = min(101, max(1, x100))
r = x + move[x100]
r = min(1, max(0, r))
return(r)
})
obsr11 = transfunc(obsr11)
obsrle11 = transfunc(obsrle11)
# par(new=TRUE)
# plot(density(obsr11), col="green")
}
cpi[year >= 2012, cpi_linked := cpi_rescaled]
cpi[year <= 2011, cpi_linked := obsrle11]
# #####
# # diagnostics for linking cpi scores
# #####
# gdt = cpi[year %in% (2011-2):(2012+2), ]
#
# # ks test
# dis1 = gdt[year >= 2012, ][["cpi_linked"]]
# dis2 = gdt[year <= 2011, ][["cpi_linked"]]
# ks.test(dis1, dis2)
#
# library("ggplot2")
# library("gridExtra")
# # distribution plots
# g1 = ggplot(gdt, mapping = aes(fill = factor(year), color = factor(year), x = cpi_rescaled)) + geom_density(alpha = .25)
# g2 = ggplot(gdt, mapping = aes(fill = factor(year), color = factor(year), x = cpi_linked)) + geom_density(alpha = .25)
# g3 = ggplot(gdt, mapping = aes(fill = year>=2012, color = year>=2012, x = cpi_rescaled)) + geom_histogram(aes(y = ..density..), alpha = .25, breaks = seq(0, 1, .05), position = "identity") + geom_density(alpha = .25, mapping = aes(fill = NULL))
# g4 = ggplot(gdt, mapping = aes(fill = year>=2012, color = year>=2012, x = cpi_linked)) + geom_histogram(aes(y = ..density..), alpha = .25, breaks = seq(0, 1, .05), position = "identity") + geom_density(alpha = .25, mapping = aes(fill = NULL))
# grid.arrange(g1,g2,g3,g4)
#
# # change over time
# cpi[, `:=`(cpi_rescaled_delta = shift(cpi_rescaled) - cpi_rescaled,
# cpi_linked_delta = shift(cpi_linked) - cpi_linked), by = c("country_name")]
#
# # constant set of countries across years (no effects from country inclusion / disinclusion)
# names = copy(cpi)[, .(N = .N), by = c("country_name")][N == max(N), ][["country_name"]]
# cdt = copy(cpi[country_name %in% names, ])
#
# # country cpi from year to year
# mdt = melt(cdt[, c("year", "country_code", "cpi_rescaled", "cpi_linked")], id.vars = c("year", "country_code"))
# ggplot(mdt, aes(x = year, y = value, color = variable, group = variable)) +
# facet_wrap(~ country_code) +
# geom_line() +
# geom_vline(xintercept = 2011.5)
#
# # mean country cpi from year to year
# sdt = cdt[, .(mean_rescaled = mean(cpi_rescaled),
# mean_linked = mean(cpi_linked)), by = c("year")]
# mdt = melt(sdt, id.vars = "year")
# ggplot(mdt, aes(x = year, y = value, color = variable, group = variable)) +
# geom_point() + geom_line() +
# geom_vline(xintercept = 2011.5)
#
# # country delta cpi from year to year
# mdt = melt(cdt[, c("year", "country_code", "cpi_rescaled_delta", "cpi_linked_delta")], id.vars = c("year", "country_code"))
# ggplot(mdt, aes(x = year, y = value, color = variable, group = variable)) +
# facet_wrap(~ country_code) +
# geom_line() +
# geom_vline(xintercept = 2011.5)
#
# # mean delta cpi from year to year
# sdt = cdt[, .(mean_rescaled = mean(cpi_rescaled_delta),
# mean_linked = mean(cpi_linked_delta)), by = c("year")]
# mdt = melt(sdt, id.vars = "year")
# ggplot(mdt, aes(x = year, y = value, color = variable, group = variable)) +
# geom_point() + geom_line() +
# geom_vline(xintercept = 2011.5)
#
# # distribution of delta cpi from year to year
# ggplot(cdt, aes(x = factor(year), y = cpi_linked_delta)) +
# geom_violin() +
# geom_vline(xintercept = 2011.5)
#####
# subset data
#####
cols_keep = c("country_code", "country_name", "year", "cpi_version", "cpi_weight", "cpi_score", "cpi_rescaled", "cpi_linked", "cpi_sources", "cpi_se", "cpi_min", "cpi_max", "cpi_adb_cpia", "cpi_bf_sgi", "cpi_bf_bti", "cpi_eiu_cr", "cpi_fh_nit", "cpi_gi_crr", "cpi_imd_wcy", "cpi_perc_arg", "cpi_prs_icrg", "cpi_vodp", "cpi_wb_cpia", "cpi_wef_eos", "cpi_wjp_rol")
cpi = cpi[, cols_keep, with = FALSE]
#####
# save data
#####
# rename dataset (doesn't use memory until acted upon)
govt_cpi_corruption = cpi
usethis::use_data(govt_cpi_corruption, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.