Nothing
############################################################################
### Apply Harmonization & Working Paper Graphs/Analyses & New/Dropped Product Data
### 16-11-2021
### Christoph Baumgartner & Janette Walde
############################################################################
#' @importFrom stats na.omit
#' @importFrom methods is
#' @export utilize_cn8
utilize_cn8 <- function(b, e, firm_data, harmonized_data = NULL,
progress = TRUE, output = "merged.firm.data",
value = FALSE, base = "CN8") {
#########################
### check variable "output"
#########################
if(!output %in% c("product.changes", "merged.firm.data", "all")) {
stop(paste0("'", output, "' is not a valid input for 'output'."))
}
#########################
### check variable "base"
#########################
if(!base %in% c("CN8", "HS6")) {
stop(paste0("'", base, "' is not a valid input for 'base'."))
}
#########################
### firm data check
#########################
if (!any(grep("firmID", colnames(firm_data)))) {
stop("The entered firm data does not contain a column named 'firmID'. Please correct.")
}
if (!any(grep("year", colnames(firm_data)))) {
stop("The entered firm data does not contain a column named 'year'. Please correct.")
}
if (!any(grep("CN8", colnames(firm_data)))) {
stop("The entered firm data does not contain a column named 'CN8'. Please correct.")
}
if (!is(firm_data$CN8, "character")) {
stop("The column 'CN8' has a wrong class. Please convert to 'character'.")
}
### check value column if needed
if (value) {
if (!any(grep("value", colnames(firm_data)))) {
stop("The entered firm data does not contain a column named 'value'. Please correct.")
}
firm_data$value <- as.numeric(firm_data$value)
}
firm_data <- firm_data
firm_data$year <- as.integer(firm_data$year)
firm_data$CN8 <- gsub(".", "", firm_data$CN8, fixed = TRUE)
firm_data$CN8 <- gsub(" ", "", firm_data$CN8, fixed = TRUE)
#########################
### define basic variables
#########################
years <- min(firm_data$year, na.rm = TRUE):max(firm_data$year, na.rm = TRUE)
nbr_years <- length(years)
#########################
### check if firm data fits input years
#########################
if (min(years) != b | max(years) != e) {
stop("The entered years of interest do not fit the firm data. Please correct.")
}
#########################
### harmonized data check
#########################
if (is.null(harmonized_data)) {
CN8_harm <- harmonize_cn8(b = b, e = e, progress = progress)
mod_part <- 5
} else {
if (!is.data.frame(harmonized_data)) {
stop("The entered harmonized data is not a dataframe. Please correct.")
}
if (!any(grep("CN8_", colnames(harmonized_data)))) {
stop("Column names of the harmonized data are not correct. Please correct.")
}
if (ncol(harmonized_data) != (e - b + 8)) {
stop(paste0("The entered harmonized data has the wrong number of columns. It has ", ncol(harmonized_data), ", while ", (e - b + 8), " are needed for the entered time period. Please correct."))
}
if (length(grep("CN8_", colnames(harmonized_data))) != (e - b + 1)) {
stop("The entered harmonized data does not have enough columns which contain CN8 codes. It has ", length(grep("CN8_", colnames(harmonized_data))), " while ", (e - b + 1), " are needed for the entered time period. Please correct.")
}
if (suppressWarnings(!all(seq(from = b, to = e) == as.integer(substr(colnames(harmonized_data[grep("CN8_", colnames(harmonized_data))]), start = 5, stop = 8))))) {
stop("The entered harmonized data does not match the given time period. Please correct.")
}
if (suppressWarnings(!all(years == as.integer(substr(colnames(harmonized_data[grep("CN8_", colnames(harmonized_data))]), start = 5, stop = 8))))) {
stop("The entered harmonized data does not match the given firm data time period. Please correct.")
}
CN8_harm <- harmonized_data
mod_part <- 1
}
#########################
### harmonize data
#########################
current_df <- as.matrix(CN8_harm[, paste0("CN8_", years)])
temp1 <- subset(CN8_harm, select = c("CN8plus", "flag","flagyear",
"HS6plus", "BEC",
"BEC_agr", "SNA"))
# merge firm data with harmonized data
get_harm <- function(x) {
ifelse(length(which(current_df == x)) > 0,
{# col and row in which the code appears first
idx <- which(current_df == x, arr.ind=T)
temp <- temp1[idx[1,1], ]},
{temp <- rep(NA, ncol(temp1))})
return(temp)
}
firm_data_list <- lapply(firm_data$CN8, get_harm)
unlistindex <- diag(ncol(temp1))
unlistindex <- ifelse(unlistindex == 1, TRUE, FALSE)
firm_data$CN8plus <- unlist(firm_data_list)[unlistindex[1,]]
firm_data$flag <- unlist(firm_data_list)[unlistindex[2,]]
firm_data$flagyear <- unlist(firm_data_list)[unlistindex[3,]]
firm_data$HS6plus <- unlist(firm_data_list)[unlistindex[4,]]
firm_data$BEC <- unlist(firm_data_list)[unlistindex[5,]]
firm_data$BEC_agr <- unlist(firm_data_list)[unlistindex[6,]]
firm_data$SNA <- unlist(firm_data_list)[unlistindex[7,]]
### define output
if (output == "merged.firm.data"){
if (progress) {
print(paste0("Work in progress... Part ", mod_part, "/", mod_part,": 0%"))
print(paste0("Work in progress... Part ", mod_part, "/", mod_part,": 100%"))
}
return(firm_data)
} else {
################################
### get all family codes (= syn)
################################
firms <- firm_data[order(firm_data$firmID, firm_data$year),]
if (value) {
###########################################################
### main function with value
###########################################################
dataNP <- data.frame(matrix(NA, nrow(firms), 12))
colnames(dataNP) <- c("firmID", "year", "period", "gap", "same_products", "value_same_products",
"new_products", "value_new_products", "dropped_products",
"value_dropped_products", "number_of_products_t", "number_of_products_tplusone")
if (progress) {
print(paste0("Work in progress... Part ", mod_part, "/", mod_part,": 0%"))
}
temp <- 0
year_matrix <- cbind(c(1:(nbr_years - 1)), years[-1])
firm_lvls <- levels(as.factor(firms$firmID))
nbr_firms <- length(firm_lvls)
# Loop over all firms to get new/changed number of products
for (i in 1:nbr_firms){
firm <- firm_lvls[i]
datafirm <- firms[firms$firmID == firm, ]
year_lvls <- levels(as.factor(datafirm$year))
# firms which only exist in one year
if (length(year_lvls) <= 1) {next}
# Get all years sequentially to identify the changes in the product codes
for (j in 1:(length(year_lvls) - 1)){
temp <- temp + 1
dataNP$firmID[temp] <- datafirm$firmID[1]
year <- year_lvls[j]
nextyear <- year_lvls[j + 1]
# check if a gap (= year difference > 1) exists
if (as.integer(nextyear) - as.integer(year) != 1) {dataNP$gap[temp] <- 1}
else {dataNP$gap[temp] <- 0}
idx_year <- year_matrix[, 2] == nextyear
dataNP$year[temp] <- nextyear
dataNP$period[temp] <- paste0(year, "-", nextyear)
# idyear = codes in year t; idnextyear = codes in year t + 1
if (base == "CN8") {
idyear <- unique(na.omit(datafirm$CN8plus[datafirm$year == year]))
idnextyear <- unique(na.omit(datafirm$CN8plus[datafirm$year == nextyear]))
} else {
idyear <- unique(na.omit(datafirm$HS6plus[datafirm$year == year]))
idnextyear <- unique(na.omit(datafirm$HS6plus[datafirm$year == nextyear]))
}
# In two consecutive years exist product codes for a specific firm
if (length(idyear) != 0 & length(idnextyear) != 0) {
dataNP$same_products[temp] <- sum(idyear %in% idnextyear)
# typos/not recognized codes in the firm data are not considered as no CN8plus exists
dataNP$dropped_products[temp] <- length(idyear) - dataNP$same_products[temp]
dataNP$new_products[temp] <- length(idnextyear) - dataNP$same_products[temp]
dataNP$number_of_products_t[temp] <- length(idyear)
dataNP$number_of_products_tplusone[temp] <- length(idnextyear)
} else if (length(idyear) != 0 & length(idnextyear) == 0) {
# In the second year of the two consecutive years no product codes exist for a specific firm
dataNP$same_products[temp] <- 0
dataNP$dropped_products[temp] <- length(idyear)
dataNP$new_products[temp] <- 0
dataNP$number_of_products_t[temp] <- length(idyear)
dataNP$number_of_products_tplusone[temp] <- 0
} else if (length(idyear) == 0 & length(idnextyear) != 0) {
# In the first year of the two consecutive years no product codes exist for a specific firm
dataNP$same_products[temp] <- 0
dataNP$dropped_products[temp] <- 0
dataNP$new_products[temp] <- length(idnextyear)
dataNP$number_of_products_t[temp] <- 0
dataNP$number_of_products_tplusone[temp] <- length(idnextyear)
}
### calculate value
if (base == "CN8") {
# get index for the current year codes
idx_idyear <- sapply(idyear, FUN = function(x) {which(datafirm$CN8plus == x)}, simplify = FALSE)
# get index for the next year codes
idx_idnextyear <- sapply(idnextyear, FUN = function(x) {which(datafirm$CN8plus == x)}, simplify = FALSE)
} else {
# get index for the current year codes
idx_idyear <- sapply(idyear, FUN = function(x) {which(datafirm$HS6plus == x)}, simplify = FALSE)
# get index for the next year codes
idx_idnextyear <- sapply(idnextyear, FUN = function(x) {which(datafirm$HS6plus == x)}, simplify = FALSE)
}
# sum up all values of same products in next year
# for intersect it must hold (all codes that remained the same & year = nextyear)
dataNP$value_same_products[temp] <- sum(datafirm$value[intersect(unlist(idx_idnextyear[idyear[idyear %in% idnextyear]]), which(datafirm$year == nextyear))], na.rm = TRUE)
# sum up all values of dropped products in next year
# for intersect it must hold (all codes that are produced in t but not in t+1 & year = year)
dataNP$value_dropped_products[temp] <- sum(datafirm$value[intersect(unlist(idx_idyear[idyear[!idyear %in% idnextyear]]), which(datafirm$year == year))], na.rm = TRUE)
# sum up all values of new products in next year
# for intersect it must hold (all codes that are produced in t+1 but not in t & year = nextyear)
dataNP$value_new_products[temp] <- sum(datafirm$value[intersect(unlist(idx_idnextyear[idnextyear[!idnextyear %in% idyear]]), which(datafirm$year == nextyear))], na.rm = TRUE)
}
if (progress) {
if (i %% 200 == 0) {
print(paste0("Work in progress... Part ", mod_part, "/", mod_part,": ", round(i/nbr_firms, 3) * 100, "%"))
}
}
}
dataNP <- dataNP[1:temp,]
if (progress) {
print(paste0("Work in progress... Part ", mod_part, "/", mod_part,": 100%"))
}
colnames(dataNP) <- c("firmID", "period_UL", "period", "gap", "same_products", "value_same_products",
"new_products", "value_new_products", "dropped_products",
"value_dropped_products", "nbr_of_products_period_LL", "nbr_of_products_period_UL")
if (output == "all"){
return(list(dataNP, firm_data))
} else {
return(dataNP)
}
} else {
###########################################################
### main function without value
###########################################################
dataNP <- data.frame(matrix(NA, nrow(firms), 9))
colnames(dataNP) <- c("firmID", "year", "period", "gap", "same_products", "new_products",
"dropped_products", "number_of_products_t", "number_of_products_tplusone")
if (progress) {
print(paste0("Work in progress... Part ", mod_part, "/", mod_part,": 0%"))
}
temp <- 0
year_matrix <- cbind(c(1:(nbr_years - 1)), years[-1])
firm_lvls <- levels(as.factor(firms$firmID))
nbr_firms <- length(firm_lvls)
# Loop over all firms to get new/changed number of products
for (i in 1:nbr_firms){
firm <- firm_lvls[i]
datafirm <- firms[firms$firmID == firm, ]
year_lvls <- levels(as.factor(datafirm$year))
# firms which only exist in one year
if (length(year_lvls) <= 1) {next}
# Get all years sequentially to identify the changes in the product codes
for (j in 1:(length(year_lvls) - 1)){
temp <- temp + 1
dataNP$firmID[temp] <- datafirm$firmID[1]
year <- year_lvls[j]
nextyear <- year_lvls[j + 1]
# check if a gap (= year difference > 1) exists
if (as.integer(nextyear) - as.integer(year) != 1) {dataNP$gap[temp] <- 1}
else {dataNP$gap[temp] <- 0}
idx_year <- year_matrix[, 2] == nextyear
dataNP$year[temp] <- nextyear
dataNP$period[temp] <- paste0(year, "-", nextyear)
# idyear = codes in year t; idnextyear = codes in year t + 1
if (base == "CN8") {
idyear <- unique(na.omit(datafirm$CN8plus[datafirm$year == year]))
idnextyear <- unique(na.omit(datafirm$CN8plus[datafirm$year == nextyear]))
} else {
idyear <- unique(na.omit(datafirm$HS6plus[datafirm$year == year]))
idnextyear <- unique(na.omit(datafirm$HS6plus[datafirm$year == nextyear]))
}
# In two consecutive years exist product codes for a specific firm
if (length(idyear) != 0 & length(idnextyear) != 0) {
dataNP$same_products[temp] <- sum(idyear %in% idnextyear)
# typos/not recognized codes in the firm data are not considered as no CN8plus exists
dataNP$dropped_products[temp] <- length(idyear) - dataNP$same_products[temp]
dataNP$new_products[temp] <- length(idnextyear) - dataNP$same_products[temp]
dataNP$number_of_products_t[temp] <- length(idyear)
dataNP$number_of_products_tplusone[temp] <- length(idnextyear)
} else if (length(idyear) != 0 & length(idnextyear) == 0) {
# In the second year of the two consecutive years no product codes exist for a specific firm
dataNP$same_products[temp] <- 0
dataNP$dropped_products[temp] <- length(idyear)
dataNP$new_products[temp] <- 0
dataNP$number_of_products_t[temp] <- length(idyear)
dataNP$number_of_products_tplusone[temp] <- 0
} else if (length(idyear) == 0 & length(idnextyear) != 0) {
# In the first year of the two consecutive years no product codes exist for a specific firm
dataNP$same_products[temp] <- 0
dataNP$dropped_products[temp] <- 0
dataNP$new_products[temp] <- length(idnextyear)
dataNP$number_of_products_t[temp] <- 0
dataNP$number_of_products_tplusone[temp] <- length(idnextyear)
}
}
if (progress) {
if (i %% 200 == 0) {
print(paste0("Work in progress... Part ", mod_part, "/", mod_part,": ", round(i/nbr_firms, 3) * 100, "%"))
}
}
}
dataNP <- dataNP[1:temp,]
if (progress) {
print(paste0("Work in progress... Part ", mod_part, "/", mod_part,": 100%"))
}
colnames(dataNP) <- c("firmID", "period_UL", "period", "gap", "same_products", "new_products",
"dropped_products","nbr_of_products_period_LL", "nbr_of_products_period_UL")
if (output == "all"){
return(list(dataNP, firm_data))
} else {
return(dataNP)
}
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.