Nothing
###############################################################################
### Get CN8+ code
### 12-11-2021
### Christoph Baumgartner & Janette Walde
###############################################################################
#' @importFrom stats na.omit setNames
#' @importFrom utils read.csv2
#' @export harmonize_cn8
harmonize_cn8 <- function(b, e, historymatrix = NULL, harmonize.to = "e",
HS6breaks = c(1992, 1996, 2002, 2007, 2012, 2017),
progress = TRUE) {
#########################
### input check
#########################
if(b >= e) {
stop("The entered last year of interest (e) is smaller than the first year of interest (b). Please correct.")
}
if (length(b) != 1 | !b%%1 == 0 | b > as.integer(substr(date(), start = 21, stop = 24))) {
stop(paste0("The entered first year of interest (b) has to be a single integer value, which has to be smaller than ", as.integer(substr(date(), start = 21, stop = 24)), ". Please correct."))
}
if (length(e) != 1 | !e%%1 == 0 | e > as.integer(substr(date(), start = 21, stop = 24))) {
stop(paste0("The entered last year of interest (e) has to be a single integer value, which has to be smaller than ", as.integer(substr(date(), start = 21, stop = 24)), ". Please correct."))
}
if (!file.exists(paste0(system.file("extdata", package = "harmonizer"), "/CN8/CN8_", b, ".rds"))) {
stop("There is no data avilable for the first year of interest (b). Consider changing the time periode, or alter data (-> get_data_directory()).")
}
if (!file.exists(paste0(system.file("extdata", package = "harmonizer"), "/CN8/CN8_", e, ".rds"))) {
stop("There is no data avilable for the last year of interest (e). Consider changing the time periode, or alter data (-> get_data_directory()).")
}
#########################
### define harmonization start/end
#########################
if (!(harmonize.to %in% c("b", "e"))) {
stop("Invalid value of 'harmonize.to', 'harmonize.to' must be set to 'e' or 'b'.")
}
if(harmonize.to == "e") {
harm_year <- e
} else {
harm_year <- b
}
#########################
### check runtime
#########################
assign("runtime", readRDS(paste0(system.file("extdata", package = "harmonizer"), "/runtime/runtime_cn8_pc8.rds")))
expect_time <- runtime$time[runtime$functions == "harmonize_cn8" & runtime$start == b & runtime$end == e]
h <- as.integer(expect_time / 3600)
m <- round((expect_time %% 3600) / 60, 0)
if (m > 10) { # round up to next 10 min
m <- round(m + 5, -1)
} else if(m == 0) {
m <- 1
}
if(length(h) > 0 & length(m) > 0) {
if(h > 0) {
print(paste0("This process may take up to ", h, " hours and ", m, " minute(s) (--> ~ ", Sys.time() + h * 3600 + m * 60, ")."))
} else {
print(paste0("This process may take up to ", m, " minute(s) (--> ~ ", Sys.time() + h * 3600 + m * 60, ")."))
}
} else {
print("Expected calculation time unkown.")
}
#########################
### history matrix check
#########################
fcalls <- sys.nframe()
if (is.null(historymatrix)) {
CN8_over_time <- history_matrix_cn8(b = b, e = e, progress = progress)
} else {
if (!is.data.frame(historymatrix)) {
stop("The entered history matrix is not a dataframe. Please correct.")
}
if (!any(grep("CN8_", colnames(historymatrix)))) {
stop("Column names of the history matrix are not correct. Please correct.")
}
if (ncol(historymatrix) != (e - b + 3)) {
stop(paste0("The entered history matrix has the wrong number of colums. It has ", ncol(historymatrix), ", while ", (e - b + 3), " are needed for the entered time period. Please correct."))
}
if (length(grep("CN8_", colnames(historymatrix))) != (e - b + 1)) {
stop("The entered history matrix does not have enough columns which contain CN8 codes. It has ", length(grep("CN8_", colnames(historymatrix))), " while ", (e - b + 1), " are needed for the entered time period. Please correct.")
}
if (!all(grep("CN8_", colnames(historymatrix[1:(e - b + 1)])) == seq(from = 1, to = (e - b + 1)))) {
stop("The structure of the entered history matrix is not correct. The first columns have to containt the CN8 codes. Please correct.")
}
if (!all(seq(from = b, to = e) == as.integer(substr(colnames(historymatrix[1:(ncol(historymatrix) - 2)]), start = 5, stop = 8)))) {
stop("The entered history matrix does not match the given time period. Please correct.")
}
CN8_over_time <- historymatrix
fcalls <- fcalls + 2
}
if (fcalls == 1) {
mod_partb <- 3
mod_parte <- 4
} else if (fcalls == 2) {
mod_partb <- 3
mod_parte <- 5
} else {
mod_partb <- 1
mod_parte <- 2
}
# get all files from data folder
filenames <- list.files(paste0(system.file("extdata", package = "harmonizer"), "/HS6"), pattern="*.csv", full.names=TRUE)
# read in possible HS6 breaks
breaks <- HS6breaks
# check which breaks are needed for the specific time period
idx_breaks <- b < breaks & e > breaks
if (any(idx_breaks)) {
# reduce used files
index <- NULL
for (i in 1:length(breaks[idx_breaks])){
index <- c(index,grep(breaks[idx_breaks][i],filenames))
}
index <- unique(index)
filenames <- filenames[index]
correspondence_lists <- lapply(filenames, read.csv2)
# split up data and rename cols
for(i in 1:length(correspondence_lists)) {
# unlist as dataframe
assign(paste0("correspondence_", breaks[idx_breaks][i]), as.data.frame(correspondence_lists[i], responseName = c("HS6", "BEC")))
# # split up in two cols
# assign(paste0("correspondence_", breaks[idx_breaks][i]),
# data.frame(do.call("rbind", strsplit(as.character(eval(parse(text = paste0("correspondence_", breaks[idx_breaks][i], "[[1]]")))),
# ";", fixed = TRUE))), )
# rename cols
assign(paste0("correspondence_", breaks[idx_breaks][i]),
setNames(eval(parse(text = paste0("correspondence_", breaks[idx_breaks][i]))), c("new", "old")))
}
}
CN8_to_BEC <- cn8_to_bec(b = b, e = e, historymatrix = CN8_over_time, progress = FALSE)
change_code <- rep("clear_code",nrow(CN8_over_time))
new_code <- rep(0,nrow(CN8_over_time))
# when did a new code pop up
flag_names <- c("flag", "flagyear")
for (j in 1:(ncol(CN8_over_time) - length(flag_names))) {
for (i in 1:nrow(CN8_over_time)) {
if (is.na(CN8_over_time[i, j])) {
new_code[i] <- substr(colnames(CN8_over_time)[j], start = 5, stop = 10)
next
}
}
}
# product codes changes in time, yes = 1
for (j in 2:(ncol(CN8_over_time) - length(flag_names))) {
for (i in 1:nrow(CN8_over_time)) {
if ((!is.na(CN8_over_time[i, j])) & (CN8_over_time[i, j] != CN8_over_time[i, j - 1]) & (!is.na(CN8_over_time[i, j - 1]))) {
change_code[i] <- 1
}
}
}
CN8_over_time$new_code <- new_code
CN8_over_time$change_code <- change_code
# various changes in the CN8 codes or merge/split
temp3 <- CN8_over_time[CN8_over_time$change_code == 1 | CN8_over_time$new_code != 0 | CN8_over_time$flag==1, !(names(CN8_over_time) %in% flag_names)]
temp3 <- temp3[, !(names(temp3) %in% c("new_code","change_code"))]
tied_codes <- matrix("no_ties",nrow(temp3),1)
if (progress) {
print(paste0("Work in progress... Part ", mod_partb, "/" , mod_parte,": 0%"))
}
for (i in 1:nrow(temp3)){
if (i==nrow(temp3)) break
for (j in (i + 1):nrow(temp3)) {
# check if there is a connection between row i and row j
if (sum(na.omit(t(temp3[i, ])) %in% na.omit(t(temp3[j, ]))) > 0) {
# check if row i has a family yet; if so use it
if (tied_codes[i] != "no_ties" & tied_codes[j] == "no_ties") {
temp_fam <- tied_codes[i]
tied_codes[j] <- temp_fam
}
if (tied_codes[i] == "no_ties" & tied_codes[j] != "no_ties") {
temp_fam <- tied_codes[j]
tied_codes[i] <- temp_fam
}
if (tied_codes[i] != "no_ties" & tied_codes[j] != "no_ties") {
if (tied_codes[i] != tied_codes[j]){
temp_fam <- tied_codes[i]
tied_codes[tied_codes==tied_codes[j]] <- temp_fam
}
}
if (tied_codes[i] == "no_ties" & tied_codes[j] == "no_ties") {
tied_codes[j] <- paste0("f_", i)
tied_codes[i] <- paste0("f_", i)
}
}
}
if (progress) {
if (i %% 100 == 0) {
print(paste0("Work in progress... Part ", mod_partb, "/" , mod_parte, ": ", round(i / nrow(temp3), 3) * 100, "%"))
}
}
}
if (progress) {
print(paste0("Work in progress... Part ", mod_partb, "/" , mod_parte, ": 100%"))
}
# Harmonization to year harm_year
CN8_over_time$CN8plus <- CN8_over_time[,paste0("CN8_", harm_year)]
CN8_over_time$CN8plus[CN8_over_time$change_code == 1 | CN8_over_time$new_code != 0 | CN8_over_time$flag == 1] <- tied_codes
CN8_over_time$CN8plus[CN8_over_time$CN8plus == "no_ties"] <- CN8_over_time[CN8_over_time$CN8plus == "no_ties",paste0("CN8_",harm_year)]
CN8_over_time <- CN8_over_time[,!(colnames(CN8_over_time) %in% c("change_code"))]
# merge CN8_over_time and CN8_to_BEC
CN8_over_time <- merge(CN8_over_time,CN8_to_BEC,by.x = "CN8plus", by.y = "CN8",all.x = TRUE)
##### Are there changes of CN8 but within HS6?
# Define variable family
CN8_over_time$family <- rep(0,dim(CN8_over_time)[1])
CN8_over_time$family[grep("f",CN8_over_time$CN8plus)] <- 1
# HS6 harmonize
CN8_over_time$HS6 <- as.character(CN8_over_time$HS6)
CN8_over_time$HS6plus <- as.character(CN8_over_time$HS6)
if(all(CN8_over_time$family != 0)) {
HS6_temp <- CN8_over_time[CN8_over_time$family == 1, ]
HS6_temp[, grep("CN8_", colnames(HS6_temp))] <- apply(HS6_temp[, grep("CN8_", colnames(HS6_temp))], 2,
substr, start = 1, stop = 6)
fams <- unique(HS6_temp$CN8plus)
if (progress) {
print(paste0("Work in progress... Part ", mod_partb + 1, "/" , mod_parte, ": 0%"))
}
# check if breaks are needed
if (any(idx_breaks)) {
for (i in 1:length(fams)) {
### check consistency of each block
consistent <- vector(mode = "logical")
# nbr of blocks = breaks + 1
for (s in 1:(length(breaks[idx_breaks]) + 1)) {
if (length(breaks[idx_breaks]) == 1) {
# special case for only one break
early_years <- as.integer(substr(colnames(HS6_temp)[grep("CN8_", colnames(HS6_temp))], start = 5, stop = 8)) < breaks[idx_breaks]
early_years_names <- colnames(HS6_temp)[grep("CN8_", colnames(HS6_temp))][early_years]
late_years_names <- colnames(HS6_temp)[grep("CN8_", colnames(HS6_temp))][!early_years]
block_1 <- as.data.frame(HS6_temp[HS6_temp$CN8plus == fams[i], early_years_names])
block_2 <- as.data.frame(HS6_temp[HS6_temp$CN8plus == fams[i], late_years_names])
if (length(unique(unlist(block_1))) == 1) {
consistent <- c(consistent, TRUE)
} else {
consistent <- c(consistent, FALSE)
}
if (length(unique(unlist(block_2))) == 1) {
consistent <- c(consistent, TRUE)
} else {
consistent <- c(consistent, FALSE)
}
break
} else {
# derive last block (> last break)
if (s == (length(breaks[idx_breaks]) + 1)) {
temp_years <- as.integer(substr(colnames(HS6_temp)[grep("CN8_", colnames(HS6_temp))], start = 5, stop = 8)) >= breaks[idx_breaks][length(breaks[idx_breaks])] #breaks[length(breaks)]
temp_years_names <- colnames(HS6_temp)[grep("CN8_", colnames(HS6_temp))][temp_years]
# get all codes in last "time block"
assign(paste0("block_", s), as.data.frame(HS6_temp[HS6_temp$CN8plus == fams[i], temp_years_names]))
} else if (s == 1) { # First block
temp_years <- as.integer(substr(colnames(HS6_temp)[grep("CN8_", colnames(HS6_temp))], start = 5, stop = 8)) < breaks[idx_breaks][s]
temp_years_names <- colnames(HS6_temp)[grep("CN8_", colnames(HS6_temp))][temp_years]
# get all codes in one "time block"
assign(paste0("block_", s), as.data.frame(HS6_temp[HS6_temp$CN8plus == fams[i], temp_years_names]))
} else { # Blocks in between
# for all other blocks but the last the following works
temp_years1 <- as.integer(substr(colnames(HS6_temp)[grep("CN8_", colnames(HS6_temp))], start = 5, stop = 8)) < breaks[idx_breaks][s]
temp_years2 <- as.integer(substr(colnames(HS6_temp)[grep("CN8_", colnames(HS6_temp))], start = 5, stop = 8)) >= breaks[idx_breaks][s - 1]
temp_years <- temp_years1 & temp_years2
temp_years_names <- colnames(HS6_temp)[grep("CN8_", colnames(HS6_temp))][temp_years]
# get all codes in one "time block"
assign(paste0("block_", s), as.data.frame(HS6_temp[HS6_temp$CN8plus == fams[i], temp_years_names]))
}
# check if all codes in this block are the same
if (length(unique(unlist(eval(parse(text = paste0("block_", s)))))) == 1) {
consistent <- c(consistent, TRUE)
} else {
consistent <- c(consistent, FALSE)
}
}
}
### check if all codes across all blocks remain the same
temp_blockvalue <- vector(mode = "character")
for (k in 1:(length(breaks[idx_breaks]) + 1)) {
temp_blockvalue <- c(temp_blockvalue, eval(parse(text = rep(paste0("block_", k, "[[1]][1]")))))
}
if (length(unique(temp_blockvalue)) == 1) {
equal <- TRUE
} else {
equal <- FALSE
}
### check if the blocks are consistent and equal
if (all(consistent) & equal) {
HS6_temp$HS6plus[HS6_temp$CN8plus == fams[i]] <- rep(unique(unlist(eval(parse(text = "block_1")))),
times = nrow(eval(parse(text = "block_1"))))
} else if (all(consistent)) {
# check if the codes across the blocks differ
# but the changes are consistent with the change list
# loop over all correspondece lists
for (j in 1:sum(idx_breaks)) {
# n ... rownbr of related correspondence list where the code of block_j appears
# m ... rownbr of related correspondence list where the code of block_(j+1) appears
n <- which(eval(parse(text = paste0("block_", j, "[[1]][1]"))) == eval(parse(text = paste0("correspondence_", breaks[idx_breaks][j], "$old"))))
m <- which(eval(parse(text = paste0("block_", j + 1, "[[1]][1]"))) == eval(parse(text = paste0("correspondence_", breaks[idx_breaks][j], "$new"))))
# the code has to appear only once, otherwise no clear assignment can be done
if (length(n) == 1 & length(m) == 1) {
if (m == n) {
HS6_temp$HS6plus[HS6_temp$CN8plus == fams[i]] <- rep(unique(unlist(eval(parse(text = paste0("block_",
length(breaks[idx_breaks]) + 1))))),
times = nrow(eval(parse(text = paste0("block_",
length(breaks[idx_breaks]) + 1)))))
}
} else {
# if the changes are not consistent with the change list keep the old family
HS6_temp$HS6plus[HS6_temp$CN8plus == fams[i]] <- rep(fams[i],
times = nrow(eval(parse(text = paste0("block_",
length(breaks[idx_breaks]) + 1)))))
}
}
} else {
# if the changes are not consistent nor equal keep the old family
HS6_temp$HS6plus[HS6_temp$CN8plus == fams[i]] <- rep(fams[i],
times = nrow(eval(parse(text = paste0("block_",
length(breaks[idx_breaks]) + 1)))))
}
if (progress) {
if (i %% 100 == 0) {
print(paste0("Work in progress... Part ", mod_partb + 1, "/" , mod_parte, ": ", round(i / length(fams), 3) * 100, "%"))
}
}
}
} else {
# case if no changes of HS6 happened in the observed time period
for (i in 1:length(fams)) {
year_names <- colnames(HS6_temp)[grep("CN8_", colnames(HS6_temp))]
block <- as.data.frame(HS6_temp[HS6_temp$CN8plus == fams[i], year_names])
# check the code did not change across the years
if (length(unique(unlist(block))) == 1) {
# if yes - rewrite HS6plus
HS6_temp$HS6plus[HS6_temp$CN8plus == fams[i]] <- rep(unique(unlist(eval(parse(text = "block")))),
times = nrow(eval(parse(text = "block"))))
} else {
# else keep the family
HS6_temp$HS6plus[HS6_temp$CN8plus == fams[i]] <- rep(fams[i], times = nrow(eval(parse(text = "block"))))
}
}
}
CN8_over_time$HS6plus[CN8_over_time$family == 1] <- HS6_temp$HS6plus
}
if (progress) {
print(paste0("Work in progress... Part ", mod_partb + 1, "/" , mod_parte, ": 0%"))
}
# get BEC and BEC_agr for HS6plus
namescol <- c("BEC","BEC_agr")
CN8_to_BEC_unique <- CN8_to_BEC[!duplicated(CN8_to_BEC$HS6),2:4]
CN8_over_time$sort <- seq(1:nrow(CN8_over_time))
temp <- merge(CN8_over_time[CN8_over_time$family == 1,!colnames(CN8_over_time) %in% namescol],
CN8_to_BEC_unique, by.x = "HS6plus", by.y = "HS6", all.x=TRUE, sort = TRUE)
temp <- temp[order(temp$sort),]
CN8_over_time$BEC[CN8_over_time$family == 1] <- temp$BEC
CN8_over_time$BEC_agr[CN8_over_time$family == 1] <- temp$BEC_agr
rm(temp)
CN8_over_time$family <- NULL
CN8_over_time$sort <- NULL
CN8_over_time$HS6 <- NULL
CN8_over_time$new_code <- NULL
#############################################
# define SNA
if(e < 2012) {
CN8_over_time$SNA_basic_class <- CN8_over_time$BEC
CN8_over_time$SNA_basic_class[which(CN8_over_time$SNA_basic_class %in% c("41", "521"))] <- "Capital good"
CN8_over_time$SNA_basic_class[CN8_over_time$SNA_basic_class %in% c("111", "121", "21",
"22", "31", "322",
"42", "53")] <- "Intermediate good"
CN8_over_time$SNA_basic_class[CN8_over_time$SNA_basic_class %in% c("112", "122", "522",
"61", "62", "63")] <- "Consumption good"
CN8_over_time$SNA_basic_class[CN8_over_time$SNA_basic_class %in% c("51", "321", "7")] <- "not_defined"
} else {
CN8_over_time$SNA_basic_class <- CN8_over_time$BEC
CN8_over_time$SNA_basic_class[which(CN8_over_time$SNA_basic_class %in% c("112", "112010", "112020",
"212", "212010", "212020",
"312", "312010", "312020",
"412", "412010", "412020",
"512", "512010", "512020",
"612", "612010", "612020",
"712", "712010", "712020",
"812", "812010", "812020"))] <- "Gross Fixed Capital Formation"
CN8_over_time$SNA_basic_class[CN8_over_time$SNA_basic_class %in% c("111", "1111", "1112", "111210", "111220", "121", "121010", "121020",
"211", "2111", "2112", "211210", "211220", "221", "221010", "221020",
"311", "3111", "3112", "311210", "311220", "321", "321010", "321020",
"411", "4111", "4112", "411210", "411220", "421", "421010", "421020",
"511", "5111", "5112", "511210", "511220", "521", "521010", "521020",
"611", "6111", "6112", "611210", "611220", "621", "621010", "621020",
"711", "7111", "7112", "711210", "711220", "721", "721010", "721020",
"811", "8111", "8112", "811210", "811220", "821", "821010", "821020")] <- "Intermediate Consumption"
CN8_over_time$SNA_basic_class[CN8_over_time$SNA_basic_class %in% c("113", "1131", "113101", "113102", "1132", "113201", "113202", "123",
"213", "2131", "213101", "213102", "2132", "213201", "213202", "223",
"313", "3131", "313101", "313102", "3132", "313201", "313202", "323",
"413", "4131", "413101", "413102", "4132", "413201", "413202", "423",
"513", "5131", "513101", "513102", "5132", "513201", "513202", "523",
"613", "6131", "613101", "613102", "6132", "613201", "613202", "623",
"713", "7131", "713101", "713102", "7132", "713201", "713202", "723",
"813", "8131", "813101", "813102", "8132", "813201", "813202", "823")] <- "Final Consumption"
}
# remove CN8plus, HS6plus, BEC and SNA if not harmonizeable, i.e. NA in year harm_year
idx_na_e <- which(is.na(eval(parse(text = paste0("CN8_over_time$CN8_", harm_year)))))
CN8_over_time$CN8plus[idx_na_e] <- NA
CN8_over_time$HS6plus[idx_na_e] <- NA
CN8_over_time$BEC[idx_na_e] <- NA
CN8_over_time$BEC_agr[idx_na_e] <- NA
CN8_over_time$SNA_basic_class[idx_na_e] <- NA
### redefine family names
# find unique families
uniq_f <- unique(CN8_over_time$CN8plus[grep("f_", CN8_over_time$CN8plus)])
for(i in 1:length(uniq_f)) {
# replace each CN8plus family by new name
CN8_over_time$CN8plus[which(CN8_over_time$CN8plus == uniq_f[i])] <- paste0("f", i)
# replace each HS6plus family by new name
CN8_over_time$HS6plus[which(CN8_over_time$HS6plus == uniq_f[i])] <- paste0("f", i)
}
CN8_over_time <- CN8_over_time[!duplicated(CN8_over_time),]
CN8_over_time$SNA <- CN8_over_time$SNA_basic_class
CN8_over_time$SNA_basic_class <- NULL
if (progress) {
print(paste0("Work in progress... Part ", mod_partb + 1, "/" , mod_parte, ": 50%"))
}
### add flag == 3
# flag == 3 indicates that at least one simple change
# Two arguments must hold in order to set flag == 3
# 1) the code is not associated with a family
# 2) the code in the first year (b) has to be different from the last year (e)
# get all codes not associated with a family
flag_rows_f <- grep("f", CN8_over_time$CN8plus, invert = TRUE)
# get all rows where first and last year codes differ
flag_rows_d <- which(CN8_over_time[[2]] != CN8_over_time[[e - b + 2]])
# check for which rows both conditions hold
flag_rows <- intersect(flag_rows_f, flag_rows_d)
## only continue if any cases exist
if(any(flag_rows)) {
# set flag == 3
CN8_over_time$flag[flag_rows] <- 3
## find flag years
flag_cols <- apply(CN8_over_time[flag_rows, 2:(e - b + 2)], 1, FUN = function(x) x[1] != x, simplify = FALSE)
# find cols where the code changed
flag_cols <- lapply(flag_cols, which)
# take smallest element, i.e. first year where the code changed
flag_cols <- lapply(flag_cols, min)
# transform list into vector
flag_cols <- unlist(flag_cols)
# add1 one in order to fit again with CN_over_time (first col is plus code)
flag_cols <- flag_cols + 1
# get colnames and therefore years
flag_cols <- as.numeric(substr(colnames(CN8_over_time[flag_cols]), start = 5, stop = 8))
## set flag years
CN8_over_time$flagyear[flag_rows] <- flag_cols
}
if (progress) {
print(paste0("Work in progress... Part ", mod_partb + 1, "/" , mod_parte, ": 100%"))
}
return(CN8_over_time)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.