# This file includes all wrapper functions called during estimation from Estimate_Data.R
# --- INSTALLING AND CALLING PACKAGES ----
#' Get package dependencies
#'
#' Expands a set of package names with all of their recursive
#' "Depends" and "Imports" dependencies, looked up in the package
#' database returned by available.packages().
#'
#' @param packs A string vector of package names
#' @return A string vector with packs together with names of any dependencies
#' @export
Do.Dependencies <- function(packs) {
  # Recursive Depends/Imports for every requested package.
  deps <- tools::package_dependencies(
    packages = packs,
    db = available.packages(),
    which = c("Depends", "Imports"),
    recursive = TRUE
  )
  # Requested packages first, then any dependencies not already listed.
  union(packs, unlist(deps))
}
#' Make package repository
#'
#' Downloads a fixed list of packages (plus all of their recursive
#' dependencies) into a local repository folder, so they can later be
#' installed on a machine without internet access.
#'
#' Side effects: changes the working directory to directory_Pkg and
#' writes "pkgFilenames.csv" (the downloaded file names) there.
#'
#' @param directory_Pkg A string vector of name directory_Pkg: where packages are to be stored
#' @return A string vector with packs
#' @export
Do.pkg.repository <- function(directory_Pkg) {
  # Script to download all packages to a local repository to install later.
  # "source" on Linux, a binary type on Windows/macOS.
  sysd <- .Platform$pkgType
  pkgs1 <- c("spData", "e1071", "classInt", "labelled", "questionr", "devtools",
             "futile.logger", "knitr", "kableExtra", "data.table", "reshape2",
             "plyr", "stringr", "ggm", "FactoClass", "questionr", "gtools",
             "sparsesvd", "SparseM", "SDMTools", "classInt", "questionr")
  packages_rblm <- c("Matrix", "SparseM", "gtools", "reshape", "splines", "quantreg",
                     "ggplot2", "nnls", "data.table", "FactoClass", "SDMTools", "lpSolve",
                     "reshape2", "stringr", "limSolve", "MASS", "quadprog",
                     "gridExtra", "foreign", "futile.logger", "Hmisc",
                     "plyr", "Rmpi", "snow", "tidyr")
  # On Linux (source installs), xclip is required for questionr.
  # BUG FIX: the original assigned to 'pkgsl' (typo) and later used
  # 'pkgs1', so xclip was never actually added to the download list.
  if (sysd == "source") {
    pkgs1 <- c(pkgs1, "xclip")
  }
  packages_all <- unique(c(pkgs1, packages_rblm))
  # Calculate recursive dependencies of the full list.
  packages <- Do.Dependencies(packages_all)
  # Download the packages and save a csv file that contains their names.
  setwd(directory_Pkg)
  pkgInfo <- download.packages(pkgs = packages, destdir = getwd(), type = sysd)
  write.csv(file = "pkgFilenames.csv", basename(pkgInfo[, 2]), row.names = FALSE)
}
#' Read names of required packages: used whenever csv file cannot be read
#'
#' Returns the hard-coded list of versioned package file names
#' (Windows .zip binaries plus a few .tar.gz source packages).
#'
#' @return A string vector with packs
#' @export
Do.names.pkgs<-function() {
pks<-c("devtools_1.13.5.zip","futile.logger_1.4.3.zip","knitr_1.20.zip","kableExtra_0.9.0.zip","data.table_1.11.4.zip",
"reshape2_1.4.3.zip","plyr_1.8.4.zip","stringr_1.3.1.zip","ggm_2.3.zip","FactoClass_1.2.4.zip","questionr_0.6.2.zip",
"gtools_3.5.0.zip","sparsesvd_0.1-4.zip","SparseM_1.77.zip","SDMTools_1.1-221.zip","Matrix_1.2-14.zip","reshape_0.8.7.zip",
"quantreg_5.36.zip","ggplot2_2.2.1.zip","nnls_1.4.zip","lpSolve_5.6.13.zip","limSolve_1.5.5.3.zip","MASS_7.3-49.zip",
"quadprog_1.5-5.zip","gridExtra_2.3.zip","foreign_0.8-70.zip","Hmisc_4.1-1.zip","snow_0.4-2.zip",
"httr_1.3.1.zip","memoise_1.1.0.zip","whisker_0.3-2.zip","digest_0.6.15.zip","rstudioapi_0.7.zip","jsonlite_1.5.zip",
"git2r_0.21.0.zip","withr_2.1.2.zip","mime_0.5.zip","curl_3.2.zip","openssl_1.0.1.zip","R6_2.2.2.zip","lambda.r_1.2.3.zip",
"futile.options_1.0.1.zip","formatR_1.5.zip","evaluate_0.10.1.zip","highr_0.6.zip","markdown_0.8.zip","yaml_2.1.19.zip",
"glue_1.2.0.zip","magrittr_1.5.zip","stringi_1.1.7.zip","xml2_1.2.0.zip","rvest_0.3.2.zip","rmarkdown_1.9.zip",
"readr_1.1.1.zip","scales_0.5.0.zip","viridisLite_0.3.0.zip","htmltools_0.3.6.zip","Rcpp_0.12.17.zip","tibble_1.4.2.zip",
"hms_0.4.2.zip","base64enc_0.1-3.zip","rprojroot_1.3-2.zip","selectr_0.4-1.zip","RColorBrewer_1.1-2.zip","dichromat_2.0-0.zip",
"munsell_0.4.3.zip","labeling_0.3.zip","pkgconfig_2.0.1.zip","rlang_0.2.1.zip","colorspace_1.3-2.zip","backports_1.1.2.zip",
"cli_1.0.0.zip","crayon_1.3.4.zip","pillar_1.2.3.zip","assertthat_0.2.0.zip","utf8_1.1.4.zip","igraph_1.2.1.zip",
"lattice_0.20-35.zip","ade4_1.7-11.zip","ggrepel_0.8.0.zip","xtable_1.8-2.zip","scatterplot3d_0.3-41.zip",
"KernSmooth_2.23-15.zip","gtable_0.2.0.zip","lazyeval_0.2.1.zip","shiny_1.1.0.zip","miniUI_0.1.1.1.zip",
"classInt_0.2-3.zip","labelled_1.1.0.zip","spData_0.2.8.3.zip","e1071_1.6-8.zip","class_7.3-14.zip","haven_1.1.1.zip",
"dplyr_0.7.5.zip","httpuv_1.4.3.zip","sourcetools_0.1.7.zip","later_0.7.2.zip","promises_1.0.1.zip",
"bindrcpp_0.2.2.zip","tidyselect_0.2.4.zip","forcats_0.3.0.zip","bindr_0.1.1.zip","purrr_0.2.5.zip",
"R.utils_2.6.0.zip","R.oo_1.22.0.zip","R.methodsS3_1.7.1.zip","MatrixModels_0.4-1.zip","survival_2.41-3.zip",
"Formula_1.2-3.zip","latticeExtra_0.6-28.zip","cluster_2.0.7-1.zip","rpart_4.1-13.zip","nnet_7.3-12.zip",
"acepack_1.4.1.zip","htmlTable_1.12.zip","viridis_0.5.1.zip","checkmate_1.8.5.zip","htmlwidgets_1.2.zip",
"withr_2.1.0.zip", "igraph_1.1.2.zip", "ade4_1.7-8.zip", "tidyr_0.8.1.zip", "zoo_1.8-6.tar.gz", "sandwich_2.5-1.tar.gz",
"lfe_2.8-3.tar.gz", "mltools_0.3.5.tar.gz", "akm_1.0.tar.gz","sparseutils_1.0.tar.gz", "haven1.1.0.tar.gz", "RSpectra_0.16-0.tar.gz" )
#"Rmpi_0.6-7.zip"
# Explicit return (the original relied on the invisible value of the
# final assignment, which only worked because callers assigned it).
pks
}
#' Install packages locally
#'
#' Installs packages from a local repository folder, matching the
#' required package names against the versioned files actually
#' present in directory_Pkg.
#'
#' Side effects: sets the TMPDIR environment variable and changes the
#' working directory to directory_Pkg.
#'
#' @param directory_Temp Unpacking directory
#' @param directory_Pkg A string vector of name directory_Pkg
#' @param directory_Lib Package Library folder
#' @param with_file 1: use csv file with pkgs name, otherwise read from list
#' @export
Do.install.from.local<-function(directory_Temp, directory_Pkg, directory_Lib, with_file ) {
sysd<-.Platform$pkgType
# Unpack in directory_Temp (useful when the default tmp dir is small).
Sys.setenv(TMPDIR=directory_Temp)
# Read the package filenames
if (with_file==1) {
#from csv file written by Do.pkg.repository()
pks <- read.csv("pkgFilenames.csv", stringsAsFactors = FALSE)[, 1]
} else if (with_file==0) {
#from R code with package names, to avoid writing to file as advised
pks<-Do.names.pkgs()
} else {
# BUG FIX: previously pks was left undefined for any other value.
stop("with_file must be 0 or 1", call. = FALSE)
}
# Strip version suffix: everything before the first "_".
pks_short<-substr(pks,1, regexpr("_",pks)-1)
setwd(directory_Pkg)
# Versioned files actually present in the local repository.
pks_vers<-list.files(directory_Pkg)
pks_vers_short<-substr(pks_vers,1, regexpr("_",pks_vers)-1)
# For each required package keep the versioned file found on disk.
# Vectorized replacement of the original match loop (same order);
# also fixes the case of zero matches, where the original left a
# bare package name in pks_in.
hit<-match(pks_short, pks_vers_short)
pks_in<-pks_vers[hit[!is.na(hit)]]
pkgFilenames<-paste0(directory_Pkg, pks_in)
# Install from the local files into directory_Lib.
temp_com<-paste0("TMPDIR=", directory_Temp)
install.packages(pkgFilenames, repos = NULL, type = sysd, configure.vars=temp_com, lib=directory_Lib)
}
#' Loading packages
#'
#' Loads the required packages from a local library with require(),
#' then sources the estimation code from directory_Prog into the
#' global environment.
#'
#' @param directory_Prog Program directory (R source files to load)
#' @param directory_Pkg Local package repository folder
#' @param directory_Lib Package Library folder
#' @param do_cluster Using clusters (1 additionally loads Rmpi)
#' @export
Do.loading.pkgs<-function(directory_Prog,directory_Pkg, directory_Lib,do_cluster ) {
#Loading packages: hard-coded list of versioned package file names
pks<-Do.names.pkgs()
#strip the version suffix (everything after the first "_")
pks_short<-substr(pks,1, regexpr("_",pks)-1)
setwd(directory_Pkg)
pks_vers<-list.files(directory_Pkg)
pks_vers_short<-substr(pks_vers,1, regexpr("_",pks_vers)-1)
#find version number in directory for each package
#NOTE(review): pks_in / pks_ni are computed here but never used below —
#only pks_short feeds the require() call. Kept as-is.
pks_in<-pks_short[1]
pks_ni<-1
for (pks_n in 1:length(pks_short) ) {
if (is.na(match(pks_short[pks_n],pks_vers_short ))==0) {
pks_in[pks_ni]<-pks_vers[match(pks_short[pks_n],pks_vers_short )]
pks_ni<-pks_ni+1
}
}
#Load every required package from the local library
sapply(pks_short, require, character.only = TRUE,lib.loc=directory_Lib )
#Loading BLM by sourcing code from the program directory
setwd(directory_Prog)
file.sources = list.files(pattern="*.R")
#exclude files whose name contains "0" and known non-R files
file.sources.other = list.files(pattern="0")
file.sources=file.sources[!(file.sources %in% file.sources.other) & !(file.sources %in% c("formatR", "Iabbp_Read.do"))]
file.sources=c(file.sources, "m2-mixt.r")
#exclude master scripts
file.other=list.files(pattern="master")
file.sources=file.sources[!(file.sources %in% file.other)]
sapply(file.sources,source,.GlobalEnv)
# setwd(directory_Prog)
# file.sources = list.files(pattern="*.r")
# sapply(file.sources,source,.GlobalEnv)
require(parallel, lib.loc=directory_Lib)
if (do_cluster==1) {
library(Rmpi) }
#in DE sometimes problematic
require(ggm, lib.loc=directory_Lib) # problem in DE comp
require(FactoClass, lib.loc=directory_Lib) # problem in DE comp
require(futile.logger, lib.loc=directory_Lib) # problem in DE comp
#require(akm, lib.loc=directory_Lib) #not possible in DE
require(tidyr, lib.loc=directory_Lib)
}
# --- READ AND SUMMARIZE DATA ----
#' Read data
#'
#' Reads the stayer (sdata) and mover (jdata) panels from csv or .dat
#' files, harmonizes the id columns, attaches per-firm/year sizes
#' (N_j) and optionally applies a wage floor.
#'
#' @param directory_Data Folder with data
#' @param filtering if put floor on wages and ceiling on abs. wage growth
#' @param data_suffix "" if standard analysis "_all" if time analysis
#' @param data_type "csv" to fread csv files, otherwise load() .dat files
#' @param other_vars 0 keeps only the core column set, otherwise all columns
#' @param floor wage floor applied when filtering==1 (default opts$loading$floor)
#' @return list(sdata=..., jdata=...) — returned invisibly (value of the final assignment)
#' @export
Do.reading.data <- function(directory_Data, filtering=0, data_suffix="",data_type="csv", other_vars=0, floor=opts$loading$floor) {
#Read in data
if (data_type=="csv") {
sdata <- fread(paste0(directory_Data, "sdata", data_suffix, ".csv"), colClasses="numeric")
jdata <- fread(paste0(directory_Data, "jdata", data_suffix, ".csv"), colClasses="numeric")
} else {
#NOTE(review): load() must create objects named sdata/jdata in this
#scope — confirm the .dat files were saved with those names.
load(paste0(directory_Data, "sdata", data_suffix, ".dat"))
load(paste0(directory_Data, "jdata", data_suffix, ".dat"))
}
#Additions and changes: store ids as character
sdata$cod_pgr<-as.character(sdata$cod_pgr) #worker id
sdata$f1<-as.character(sdata$f1) #firm id, period 1
sdata$f2 <- (sdata$f1) #stayers: same firm in both periods
jdata$cod_pgr<-as.character(jdata$cod_pgr) #worker id
jdata$f1<-as.character(jdata$f1) #firm id, period 1
jdata$f2<-as.character(jdata$f2) #firm id, period 2
#constant column (used downstream as a count helper)
sdata$x<-1
jdata$x=1
if (other_vars==0) {
#keep only the core columns
jdata = jdata[,list(cod_pgr,anno,y1,y2,f1,f2,age,serv,manuf,ret_trade,cons,edu_belowsecondary,edu_secondary,edu_tertiary)]
sdata = sdata[,list(cod_pgr,anno,y1,y2,f1,f2,age,serv,manuf,ret_trade,cons,edu_belowsecondary,edu_secondary,edu_tertiary)]
}
#Firm size: observations per firm and year, pooled over stayers and movers
tdata<-data.table(rbind(sdata,jdata, fill=TRUE))
#drop any stale N_j before recomputing it
if ("N_j" %in% colnames(tdata) ){
tdata<- tdata[,!"N_j",with=FALSE]
sdata<- sdata[,!"N_j",with=FALSE]
jdata<- jdata[,!"N_j",with=FALSE]
}
Ns_dat<-tdata[,list(N_j=.N),by=.(f1, anno)]
setkeyv(Ns_dat, c("f1", "anno"))
#attach firm-year size by update-join
jdata[Ns_dat,N_j:=i.N_j, on=c("f1","anno")]
sdata[Ns_dat,N_j:=i.N_j, on=c("f1", "anno")]
if (filtering==1) {
#2) Filtering: keep only wages strictly above the floor in both periods
sdata<-sdata[y1>floor][y2>floor]
jdata<-jdata[y1>floor][y2>floor]
#Wage change more than 1 in abs
# sdata<-sdata[abs(y2-y1)<1]
# jdata<-jdata[abs(y2-y1)<1]
}
#In case Do.residualize is not performed (otherwise overwritten)
sdata$y1_l<-0
jdata$y1_l<-0
#Size of data set
Ns<-nrow(sdata)
Nm<-nrow(jdata)
stayer_share = sum(Ns)/(sum(Ns)+sum(Nm))
flog.info("reading data Ns=%i, Nm=%i , stayer_share=%2f",Ns,Nm,stayer_share);
# combine the movers and stayers, ad stands for all data:
ad = list(sdata=sdata,jdata=jdata)
}
#' Residualize wages on an age profile (and year effects)
#'
#' Regresses period-1 and period-2 wages on a quadratic age profile
#' (plus year fixed effects when sdata spans several years), using the
#' stayers for estimation, and replaces y1/y2 in both sdata and jdata
#' with the residuals. The original y1 is preserved in y1_l.
#'
#' @param ad Data (list with sdata and jdata)
#' @param residualize If to residualize (1) or not (0)
#' @return ad with y1/y2 residualized and y1_l holding original y1 (or 0)
#' @export
Do.residualize<-function(ad, residualize=1) {
  if (residualize==1) {
    # Quadratic age regressors.
    ad$sdata$age2<-ad$sdata$age*ad$sdata$age
    ad$jdata$age2<-ad$jdata$age*ad$jdata$age
    n_years <- length(unique(ad$sdata$anno))
    if (n_years >= 1) {
      # With a single year, the year fixed effects collapse to an
      # intercept. This consolidates the two previously duplicated
      # branches; behavior is unchanged.
      fml1 <- if (n_years == 1) y1 ~ 1+age+age2 else y1 ~ factor(anno)+age+age2
      fml2 <- if (n_years == 1) y2 ~ 1+age+age2 else y2 ~ factor(anno)+age+age2
      # First period: fit on stayers, residualize stayers and movers.
      lM <- lm(fml1, data=ad$sdata)
      ad$sdata$ry1<-ad$sdata$y1-predict(lM)
      ad$jdata$ry1<-ad$jdata$y1-predict(lM, ad$jdata)
      ad$jdata$panno<-ad$jdata$anno+1
      ad$sdata$panno<-ad$sdata$anno+1
      # Second period.
      lM <- lm(fml2, data=ad$sdata)
      ad$sdata$ry2<-ad$sdata$y2-predict(lM)
      ad$jdata$ry2<-ad$jdata$y2-predict(lM, ad$jdata)
    }
    # Name adjustment: keep raw y1 in y1_l, overwrite y1/y2 with residuals.
    ad$sdata$y1_l<-ad$sdata$y1
    ad$jdata$y1_l<-ad$jdata$y1
    ad$jdata$y1<-ad$jdata$ry1
    ad$jdata$y2<-ad$jdata$ry2
    ad$sdata$y1<-ad$sdata$ry1
    ad$sdata$y2<-ad$sdata$ry2
  } else if (residualize==0) {
    ad$sdata$y1_l<-0
    ad$jdata$y1_l<-0
  }
  return(ad)
}
#' Do Event Study
#'
#' Builds the event-study table for movers between firm groups:
#' mean firm wages in periods 1 and 2 by origin/destination group
#' (j1, j2), reported for moves between groups 1 and 4.
#'
#' @param ad Data
#' @param reporting reporting wether to save to file or print only on screen ( & into Log file)
#' @param opts_min minimum number of observations for confidentiality (default opts$minN)
#' @export
Do.event.study<-function(ad, reporting, opts_min=opts$minN){
#Event study
#mean firm wage in each period, attached to every mover row (by f1)
invisible(ad$jdata[,m_l1:=mean(y1),by=f1])
invisible(ad$jdata[,m_l2:=mean(y2),by=f1])
#classify firms into k=4 groups from the y1 ecdf measures
ms_q = grouping.getMeasures(ad,"ecdf",Nw=20,y_var = "y1")
grps_q = grouping.classify.once(ms_q,k = 4,nstart = 1000,iter.max = 200,step=250)
ad = grouping.append(ad,grps_q$best_cluster,drop=T)
#summaries
ms<-mean(rbind(ad$sdata$y1_l, ad$jdata$y1_l)) #adding left-out mean
#reporting==0 additionally records worker/firm counts for the confidentiality check
if (reporting==0) {
ss2<-ad$jdata[,list("w_1"=mean(m_l1)+ms, "w_2"=mean(m_l2)+ms, "N_w"=length(unique(cod_pgr, na.rm=T)),"N_f"=length(unique(f1, na.rm=T))),by=.(j1,j2)]
} else if (reporting!=0) {
ss2<-ad$jdata[,list("w_1"=mean(m_l1)+ms, "w_2"=mean(m_l2)+ms),by=.(j1,j2)]
}
#keep only the four corner moves: 1->1, 1->4, 4->1, 4->4
s2<-data.table(j1=rep(c(1,4),2), j2=sort(rep(c(1,4),2)))
setkeyv(ss2,c("j1","j2"))
setkeyv(s2,c("j1","j2"))
#update-join the summaries onto the corner-move skeleton
s2[ss2,w_1:=i.w_1]
s2[ss2,w_2:=i.w_2]
if (reporting==0) {
s2[ss2,N_f:=i.N_f]
s2[ss2,N_w:=i.N_w]
}
s2$name<-paste0(s2$j1, " to ", s2$j2)
#Deleting if less than the required number of observations for confidentiality
if (reporting==0) {
s2$w_1[s2$N_w<opts_min]<-NaN
s2$w_2[s2$N_w<opts_min]<-NaN
}
#drop the j1/j2 key columns and transpose: one column per move type
s2<-t(s2[,3:ncol(s2)])
return(s2)
}
#' Do Summary Kline
#'
#' Builds the descriptive-statistics table (by region indicator
#' Bel_all plus a pooled column) for the largest connected set and
#' for the largest leave-out set, and stacks the two tables.
#'
#' @param sim Data (list with sdata and jdata)
#' @param reporting wether to save to file or print only on screen ( & into Log file)
#' @param country name to save results (NOTE(review): currently unused in the body)
#' @return matrix with formatted summary rows for both sets
#' @export
Do.summary.Kline<-function(sim, reporting, country) {
  # Internal helper: build one summary block (per-region columns plus a
  # pooled column) from the current sim$sdata/sim$jdata. Factors out the
  # code that was duplicated verbatim for the connected and leave-out sets.
  summarize_sample <- function(sim) {
    # Reshape as single data vectors, stacking period 1 and period 2
    # observations of movers (jdata) and stayers (sdata).
    y1_all<-c(sim$jdata$y1, sim$sdata$y1,sim$jdata$y2, sim$sdata$y2 )
    f1_all<-c(sim$jdata$f1, sim$sdata$f1,sim$jdata$f2, sim$sdata$f2 )
    Bel_all<-c(sim$jdata$Bel1, sim$sdata$Bel1, sim$jdata$Bel2, sim$sdata$Bel2 )
    cod_all <-c(sim$jdata$cod_pgr, sim$sdata$cod_pgr, sim$jdata$cod_pgr, sim$sdata$cod_pgr )
    fullt_all<-c(sim$jdata$fullt, sim$sdata$fullt, sim$jdata$fullt, sim$sdata$fullt)
    fullt_all[is.na(fullt_all)]<-0
    # Mover indicator: 1 for jdata observations, 0 for sdata.
    mov_all<-c(((sim$jdata$y1)*0+1),(sim$sdata$y1*0),((sim$jdata$y2)*0+1),(sim$sdata$y2*0) )
    sms = list(y1_all=y1_all, f1_all=f1_all, Bel_all=Bel_all, cod_all=cod_all, mov_all=mov_all, fullt_all=fullt_all)
    setDT(sms)
    # Per-region summary; round() keeps 2 resp. 4 decimal places.
    ss<-t(sms[,list("number of observations"=length(y1_all),
                    "number of movers"=length(unique(cod_all[mov_all==1])),
                    "number of firms"=length(unique(f1_all)),
                    "share of full-time observations"=round(mean(fullt_all)*100*10^2,digits=0)/10^2,
                    "mean log-earnings"=round(mean(y1_all)*10^4, digits=0)/10^4,
                    "variance of log-earnings"=round(sd(y1_all)^2*10^4, digits=0)/10^4),
              by=Bel_all][order(Bel_all)])
    # Pooled summary over all regions.
    # (typo fix: row label used to read "numbder of observations")
    sa<-t(sms[,list("number of observations"=length(y1_all),
                    "number of movers"=length(unique(cod_all[mov_all==1])),
                    "number of firms"=length(unique(f1_all, na.rm=T)),
                    "share of full-time observations"=round(mean(fullt_all)*100*10^2,digits=0)/10^2,
                    "mean log-earnings"=round(mean(y1_all)*10^4, digits=0)/10^4,
                    "variance of log-earnings"=round(sd(y1_all)^2*10^4, digits=0)/10^4)])
    # Drop the Bel_all label row that t() adds on the grouped table.
    ss<-ss[2:(length(sa)+1),]
    cbind(ss,sa)
  }
  #1) Summary table for the connected set
  f0s = get.largest.conset.fid(sim$jdata)
  sim$jdata = sim$jdata[f1%in%f0s][f2%in%f0s]
  sim$sdata = sim$sdata[f1%in%f0s]
  sx<-summarize_sample(sim)
  #2) Summary table for the leave-out set (firms in the leave-out graph)
  jids_con = get.largest.leaveoutset.fid(sim$jdata)
  jidscon<-unique(c(jids_con$f1,jids_con$f2))
  sim$jdata<-sim$jdata[( (sim$jdata$f1 %in% jidscon)) ]
  sim$sdata<-sim$sdata[( (sim$sdata$f1 %in% jidscon)) ]
  sx_L<-summarize_sample(sim)
  #join both tables
  sT<-rbind(sx, sx_L)
  #Formatting: thousands separator on the three count rows of each block
  sepdat<-matrix(c(rep(1,(3*1)), rep(0, (3*1)),rep(1,(3*1)), rep(0, (3*1)) ))
  spdat<-matrix(sepdat,nrow=12,ncol=ncol(sT))
  sT[spdat==1]<-prettyNum(sT[spdat==1], big.mark = ",")
  colnames(sT)<-cbind("Rovigo", "Belluno","Rovigo-Belluno")
  return(sT)
}
#' Do Summary Stats
#'
#' Computes the summary-statistics table grouped by firm class j1 and
#' relabels the columns as groups 1-10 plus the pooled "all" column.
#'
#' @param ad Data
#' @param reporting wether to save to file or print only on screen ( & into Log file)
#' @param opts_min minimum number of observations for confidentiality (default opts$minN)
#' @export
Do.summary.stats<-function(ad, reporting, opts_min=opts$minN) {
  stats_tab <- Do.compute.stats(ad, groupi="j1", reporting, opts_min)
  # One column per firm class plus the pooled column.
  colnames(stats_tab) <- cbind("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "all")
  return(stats_tab)
}
#' Do Compare Leave-out and Left-out Sets
#'
#' Computes summary statistics (via Do.compute.stats, split by
#' mover/stayer status) for three samples: the full data, the largest
#' connected set, and the largest leave-out set.
#'
#' @param sim Data
#' @param reporting 0 additionally logs an explanatory note for the confidentiality check
#' @param opts_min minimum number of observations for confidentiality (default opts$minN)
#' @return list with elements full, connect, leave_out
#' @export
Do.compare.sets<-function(sim, reporting, opts_min=opts$minN){
sim.copy<-copy(sim)
if (reporting==0){
#NOTE(review): "Describtive" is a typo in this runtime log message;
#left unchanged here since this edit only touches comments.
invisible(Info_S<-"This produces the summary statistics by leave-out and left-out set and for all firms combined.
Table 1): Describtive Statistics
The rows describe the contained information, where the first row describes the set (leave-out or left-out).
FOR CONFIDENTIALITY TEST:
In this table, results computed with less than the required min. number of obs. are denoted as NaN for confidentiality.")
flog.info(Info_S)
}
#1) full data summary, split by mover (1) vs stayer (0)
sim$jdata$mov<-1
sim$sdata$mov<-0
sx_full<-Do.compute.stats(sim, groupi="mov", reporting, opts_min)
#2) connected set
# extract largest connected set of firms
f0s = get.largest.conset.fid(sim$jdata)
sim$jdata = sim$jdata[f1%in%f0s][f2%in%f0s]
sim$sdata = sim$sdata[f1%in%f0s]
#NOTE(review): this tdata is recomputed inside Do.compute.stats, so
#the next assignment has no effect on the result.
sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
sx_connect<-Do.compute.stats(sim, groupi="mov", reporting, opts_min)
#3) leave-out set: restart from the unfiltered copy
sim<-copy(sim.copy)
sim$jdata$mov<-1
sim$sdata$mov<-0
jids_con = get.largest.leaveoutset.fid(sim$jdata)
jidscon<-unique(c(jids_con$f1,jids_con$f2))
sim$jdata = sim$jdata[f1%in%jidscon][f2%in%jidscon]
sim$sdata = sim$sdata[f1%in%jidscon]
sx_leave_out<-Do.compute.stats(sim, groupi="mov", reporting, opts_min)
colnames(sx_full)<-c( "Stayers", "Movers", "All")
colnames(sx_connect)<-c("Stayers", "Movers", "All")
colnames(sx_leave_out)<-c("Stayers", "Movers", "All")
sx<-list()
sx$full<-sx_full
sx$connect<-sx_connect
sx$leave_out<-sx_leave_out
return(sx)
}
#' Do Compare Leave-out and Left-out Sets (mover distributions)
#'
#' Computes the per-firm mover-count distribution (Do.analyze.distr)
#' for three samples: the full data, the largest connected set, and
#' the largest leave-out set.
#'
#' @param sim Data
#' @param reporting 0 additionally logs an explanatory note for the confidentiality check
#' @param opts_min minimum number of observations for confidentiality (default opts$minN)
#' @return list with elements full, connect, leave_out
#' @export
Do.compare.sets.movers<-function(sim, reporting, opts_min=opts$minN){
  sim.copy<-copy(sim)
  if (reporting==0){
    # Typo fix in the logged note ("producesmovers" -> "produces movers").
    Info_S<-"This produces movers statistics by leave-out and left-out set and for all firms combined.
FOR CONFIDENTIALITY TEST:
In this table, results computed with less than the required min. number of obs. are denoted as NaN for confidentiality."
    flog.info(Info_S)
  }
  #1) full data summary
  sx_full<-Do.analyze.distr(sim)
  #2) connected set: extract largest connected set of firms
  f0s = get.largest.conset.fid(sim$jdata)
  sim$jdata = sim$jdata[f1%in%f0s][f2%in%f0s]
  sim$sdata = sim$sdata[f1%in%f0s]
  sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
  sx_connect<-Do.analyze.distr(sim)
  #3) leave-out set: restart from the unfiltered copy
  sim<-copy(sim.copy)
  jids_con = get.largest.leaveoutset.fid(sim$jdata)
  jidscon<-unique(c(jids_con$f1,jids_con$f2))
  sim$jdata = sim$jdata[f1%in%jidscon][f2%in%jidscon]
  sim$sdata = sim$sdata[f1%in%jidscon]
  sx_leave_out<-Do.analyze.distr(sim)
  sx<-list()
  sx$full<-sx_full
  sx$connect<-sx_connect
  sx$leave_out<-sx_leave_out
  return(sx)
}
#' Distribution of the number of movers per firm
#'
#' Computes, over firms with at most max_size movers, the cumulative
#' distribution of the per-firm mover count (pooling both sides of
#' each move, f1 and f2).
#'
#' @param ad Data (list with jdata containing firm ids f1, f2)
#' @param max_size upper bound on movers per firm to include
#'   (default 10, the previously hard-coded cutoff)
#' @return data.table with columns N (mover count), N_f (number of firms), cdf_N_f
#' @export
Do.analyze.distr<-function(ad, max_size=10) {
  # Count movers per firm, then keep small firms only.
  tt<-ad$jdata[,list(f1=c(f1,f2))][,.N,f1][N<=max_size]
  cum.prob<-tt[,list(N_f=.N),by=(N)][order(N)]
  cum.prob$cdf_N_f<-cumsum(cum.prob$N_f)/sum(cum.prob$N_f)
  return(cum.prob)
}
#' Binned wage distribution
#'
#' Pools stayers and movers and returns a 15-break histogram of y1 as
#' a table of bin midpoints, cumulative shares, and counts.
#'
#' @param ad Data (list with sdata and jdata)
#' @return data.table with columns wage (bin midpoint), cdf_wage, N (bin count)
#' @export
Do.analyze.wage<-function(ad){
  tdata<-data.table(rbind(ad$sdata,ad$jdata, fill=TRUE))
  # plot = FALSE: only the binned counts are needed; the original
  # drew a histogram to the active graphics device as a side effect.
  data_w<-hist(tdata$y1, breaks=15, plot=FALSE)
  distr_wage<-data.table(wage=data_w$mids, cdf_wage=cumsum(data_w$counts)/sum(data_w$counts), N=data_w$counts)
  return(distr_wage)
}
#' Do Compute Stats
#'
#' Computes the descriptive-statistics table: one column per value of
#' the grouping column plus one pooled column over all observations.
#' When reporting==0, cells from groups with fewer than opts_min
#' workers are masked as NaN (confidentiality rule).
#'
#' @param sim Data (list with sdata and jdata; pooled into tdata here)
#' @param groupi name of the pooled-data column to group by (e.g. "mov", "j1")
#' @param reporting 0 masks small cells as NaN for confidentiality
#' @param opts_min minimum number of observations (confidentiality threshold)
#' @return matrix: one row per statistic, one column per group plus the pooled column
#' @export
Do.compute.stats<-function(sim, groupi, reporting=0, opts_min=opts$minN){
sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
#within-firm sd of log-earnings, one value per firm f1
invisible(sim$tdata[,sv:=sd(as.numeric(y1_l), na.rm=T),by=f1])
#idiom fix: extract the grouping column with [[ ]] instead of
#eval(parse(text=paste0(...))); same vector, no string evaluation
group=sim$tdata[[groupi]]
#per-group statistics (note: na.rm passed to unique() is silently ignored)
sa<-sim$tdata[, list("number of workers"=length(unique(as.character(cod_pgr), na.rm=T)),
"number of firms"=length(unique(as.character(f1), na.rm=T)),
"mean observed firm size"=mean(as.numeric(N_j), na.rm=T),
"number firms $\\geq$ 10"=ifelse(length(unique(as.character(f1[na.omit(N_j)>=10])))==0,0L,length(unique(as.character(f1[na.omit(N_j)>=10])))),
"number firms $\\geq$ 50"=ifelse(length(unique(as.character(f1[na.omit(N_j)>=50])))==0,0L,length(unique(as.character(f1[na.omit(N_j)>=50])))),
"% less than secondary education"=mean(as.numeric(edu_belowsecondary), na.rm=T),
"% secondary education"=mean(as.numeric(edu_secondary), na.rm=T),
"% tertiary education"=mean(as.numeric(edu_tertiary), na.rm=T),
"% workers younger than 30"=mean(as.numeric(age)<30)*100,
"% workers between 31 and 50"=mean(as.numeric(age)<50 & as.numeric(age)>=31)*100,
"% workers older than 50"=mean(as.numeric(age)>=50)*100,
"% workers in manufacturing"=mean(as.numeric(manuf), na.rm=T)*100,
"% workers in services"=mean(as.numeric(serv), na.rm=T)*100,
"% workers in retail and trade"=mean(as.numeric(ret_trade), na.rm=T)*100,
"% workers in construction"=mean(as.numeric(cons), na.rm=T)*100,
"mean log-earnings"=mean(as.numeric(y1_l), na.rm=T),
"variance of log-earnings"=sd(as.numeric(y1_l), na.rm=T)^2,
"between firm variance of log-earnings"=sd(sv, na.rm=T)^2),by=eval(group)][order(eval(group))]
#pooled statistics over all observations
ss<-sim$tdata[, list("number of workers"=length(unique(as.character(cod_pgr), na.rm=T)),
"number of firms"=length(unique(as.character(f1), na.rm=T)),
"mean observed firm size"=mean(as.numeric(N_j), na.rm=T),
"number firms $\\geq$ 10"=ifelse(length(unique(as.character(f1[na.omit(N_j)>=10])))==0,0L,length(unique(as.character(f1[na.omit(N_j)>=10])))),
"number firms $\\geq$ 50"=ifelse(length(unique(as.character(f1[na.omit(N_j)>=50])))==0,0L,length(unique(as.character(f1[na.omit(N_j)>=50])))),
"% less than secondary education"=mean(as.numeric(edu_belowsecondary), na.rm=T),
"% secondary education"=mean(as.numeric(edu_secondary), na.rm=T),
"% tertiary education"=mean(as.numeric(edu_tertiary), na.rm=T),
"% workers younger than 30"=mean(as.numeric(age)<30)*100,
"% workers between 31 and 50"=mean(as.numeric(age)<50 & as.numeric(age)>=31)*100,
"% workers older than 50"=mean(as.numeric(age)>=50)*100,
"% workers in manufacturing"=mean(as.numeric(manuf), na.rm=T)*100,
"% workers in services"=mean(as.numeric(serv), na.rm=T)*100,
"% workers in retail and trade"=mean(as.numeric(ret_trade), na.rm=T)*100,
"% workers in construction"=mean(as.numeric(cons), na.rm=T)*100,
"mean log-earnings"=mean(as.numeric(y1_l), na.rm=T),
"variance of log-earnings"=sd(as.numeric(y1_l), na.rm=T)^2,
"between firm variance of log-earnings"=sd(sv, na.rm=T)^2)]
#Filter out means with less than the required min number of observations for confidentiality
if (reporting==0){
indx_more20_ss<-matrix((ss[,1]<opts_min),nrow=nrow(ss),(ncol=ncol(ss)))
indx_more20_sa<-matrix((sa[,2]<opts_min),nrow=nrow(sa),(ncol=ncol(sa)))
ss[indx_more20_ss]<-NaN
sa[indx_more20_sa]<-NaN
}
ss<-t(ss)
sa<-t(sa)
#drop the group-value row that t() adds on the grouped table
sa<-sa[2:(nrow(sa)),]
sx<-cbind(sa,ss)
#drop rows with missing info (this is country specific)
indx_i=complete.cases(sx)
sx<-sx[indx_i,]
#
# #Formatting
# #N of digits matrix
# mmdat <- matrix(c(rep(10^0, (5*1)), rep(10, (10*1)), rep(10^2,1), rep(10^3,(2*1))), nrow = 18, ncol=1, byrow=TRUE)
# mdat<-matrix(mmdat,nrow=nrow(mmdat),(ncol=ncol(sx)))
# mdat<-mdat[indx_i,]
# sx<-round(sx*mdat, digits=0)/mdat
# #matrix for adding %
# ppdat<-matrix(c(rep("",(5*1)), rep("%", (10*1)), rep("", (3*1)) ))
# pdat<-matrix(ppdat,nrow=nrow(mmdat),ncol=ncol(sx))
# pdat<-pdat[indx_i,]
# sx[,1:ncol(sx)]<-paste(sx[,1:ncol(sx)], pdat[,1:ncol(sx)], sep="", collapse=NULL)
# #thousands separator
# sepdat<-matrix(c(rep(1,(5*1)), rep(0, (13*1)) ))
# spdat<-matrix(sepdat,nrow=nrow(mmdat),ncol=ncol(sx))
# spdat<-spdat[indx_i,]
# sx[spdat==1]<-prettyNum(sx[spdat==1], big.mark = ",")
return(sx)
}
# --- COMPUTING STATS ----
#' Do Compute wage groups
#'
#' Computes model-implied log wages in both periods at qt quantiles of
#' the worker-effect distribution, for every firm class l, and returns
#' them in long format.
#'
#' @param res2 object from estimation (needs A1, B1, A2, B2, Em, Esd, Ns)
#' @param ms mean (left-out) log wage, added back to every value
#' @param qt number of worker-effect quantiles (default 6, the previously hard-coded value)
#' @return long data.table with columns l, k, variable (y1/y2), value
#' @export
Do.compute.groups.w<-function(res2, ms, qt=6) {
  # (unused local res_ql removed)
  model<-res2
  # One row per firm class l with its wage-equation parameters.
  rr = data.table(l=1:length(model$A1),Em = model$Em,Esd = model$Esd,
                  N = model$Ns,A1=model$A1,B1=model$B1,A2=model$A2,B2=model$B2)
  # Mean and sd of worker effects, weighted by class sizes N.
  alpha_m = rr[, wtd.mean(Em,N)]
  alpha_sd = sqrt(rr[, wtd.mean(Esd^2,N) + wtd.var(Em,N) ])
  # Normal quantiles at which to evaluate the worker effect.
  qts = qnorm( (1:qt)/(qt+1))
  # Wage in each period: A + B * (worker effect at quantile k).
  rr2 = rr[, list( y1= (qts*alpha_sd + alpha_m)* B1+ A1 ,y2= (qts*alpha_sd + alpha_m)* B2+ A2, k=1:qt),l ]
  rr2 = melt(rr2,id.vars = c('l','k'))
  # Add back the left-out mean.
  rr2$value<-rr2$value+ms
  return(rr2)
}
#' Sparse colSums
#'
#' Column sums of a (possibly sparse) matrix computed via a
#' ones-vector product, returned as a plain numeric vector.
#' @param M a matrix (base matrix or Matrix-package object supporting %*%)
#' @return numeric vector of column sums
#' @export
sColSums <- function(M) {
  ones <- rep(1, nrow(M))
  as.numeric(ones %*% M)
}
#' Sparse rowSums
#'
#' Row sums of a (possibly sparse) matrix computed via a ones-vector
#' product, returned as a plain numeric vector.
#' @param M a matrix (base matrix or Matrix-package object supporting %*%)
#' @return numeric vector of row sums
#' @export
sRowSums <- function(M) {
  ones <- rep(1, ncol(M))
  as.numeric(M %*% ones)
}
#' Hybrid leave-out estimation wrapper
#'
#' Reads the multi-year data, keeps the last three years and firms
#' with at least sizeM movers, then repeats the leave-out estimation
#' leaveoutN times over the shares in nm_list, returning the mean and
#' sd of the statistics across repetitions.
#'
#' @param sizeM minimum number of movers per firm to keep
#' @param nm_list shares of movers to keep in each leave-out run
#' @param leaveoutN number of repetitions of the whole exercise
#' @param data_type "csv" or "dat" (passed to Do.reading.data)
#' @param directory_Data Folder with data
#' @param filtering passed to Do.reading.data
#' @param residualize passed to Do.residualize
#' @param cl cluster object; currently unused (kept for interface
#'   compatibility with the removed commented-out parallel variant)
#' @return list(mean=..., sd=...) of statistics by share_mover_keep and hetero
#' @export
Do.hybrid.leaveout<-function(sizeM=15, nm_list=c(seq(1,0.1,l=10),0.05), leaveoutN=1, data_type, directory_Data, filtering, residualize, cl=NaN) {
  # Read in data: multi-year data required.
  ad<-Do.reading.data(directory_Data, filtering, data_suffix="_all", data_type)
  # Keep the last three years of data.
  ad$sdata<-ad$sdata[anno>=(max(anno)-2)]
  ad$jdata<-ad$jdata[anno>=(max(anno)-2)]
  # Residualize wages.
  ad<-Do.residualize(ad, residualize)
  # Get firms with at least sizeM movers (pooling f1 and f2 sides).
  fids = ad$jdata[,list(f1=c(f1,f2))][,.N,f1][N>=sizeM,f1]
  # Only keep these firms.
  ad$sdata = ad$sdata[f1 %in% fids]
  ad$jdata = ad$jdata[f1 %in% fids][f2 %in% fids]
  # Reproducible random subsampling across runs.
  set.seed(324313)
  # Preallocate instead of growing with rbind inside the loop.
  res_list <- vector("list", leaveoutN)
  for (itN in seq_len(leaveoutN)){
    flog.info("[Hybrid-leave-out] Iteration=%i", itN)
    res_list[[itN]] <- Do.leaveout.wrap(ad, nm_list, itN)
  }
  reshyb <- do.call(rbind, res_list)
  # Mean and sd over repetitions, per share kept and hetero setting.
  res_hybrid=list()
  res_hybrid$mean<-setDT(reshyb)[,lapply(.SD, mean, na.rm=TRUE), by=.(share_mover_keep,hetero) ]
  res_hybrid$sd<-setDT(reshyb)[,lapply(.SD, sd, na.rm=TRUE), by=.(share_mover_keep,hetero) ]
  return(res_hybrid)
}
#' One repetition of the leave-out exercise over all shares
#'
#' Runs the baseline trace estimation on the full mover sample
#' (share_mover_keep = 100), then Do.leaveout.core for each share in
#' nm_list, retrying each share up to 10 times on failure.
#'
#' @param sim Data (list with sdata and jdata)
#' @param nm_list shares of movers to keep
#' @param itN repetition index (recorded in the output)
#' @return data.frame of stacked stats for all shares
#' @export
Do.leaveout.wrap<-function(sim, nm_list, itN){
  resall=data.frame()
  # Baseline: homoscedastic trace estimation on the full set of movers.
  # (A discarded with(rr$stats, psi_var/set_logwage_var) expression was
  # removed here: its value was never used.)
  rr = m2.trace.estimate(sim,hetero=FALSE)
  rr$stats$share_mover_keep = 100
  rr$stats$itN = itN
  resall=rbind(resall,rr$stats)
  for (nm_i in nm_list) { # list of shares of movers to keep
    res_n=data.frame()
    # Retry up to 10 attempts (same budget as the original while loop):
    # the random subsampling inside Do.leaveout.core can fail.
    for (triali in 1:10) {
      res_n<-try(Do.leaveout.core(sim, nm_i, itN))
      if (!is(res_n, "try-error")) break
    }
    if (is(res_n, "try-error")) {
      # All attempts failed: skip this share with a warning instead of
      # rbind-ing a try-error object into the results (original behavior).
      flog.warn("[Hybrid-leave-out] all retries failed for share=%2f", nm_i)
      next
    }
    resall<-rbind(resall,res_n)
  }
  return(resall)
}
#' One leave-out estimation for a given share of movers
#'
#' Randomly keeps the requested share of mover observations and runs
#' the trace estimation without and with the heteroscedasticity
#' correction, stacking both sets of statistics.
#'
#' @param sim Data (list with sdata and jdata)
#' @param share_mover_keep share of mover rows to keep (0-1)
#' @param itN repetition index (recorded in the output)
#' @return data.frame with one block of stats per hetero setting
#' @export
Do.leaveout.core<-function(sim, share_mover_keep, itN) {
  out = data.frame()
  flog.info("[Hybrid-leave-out] Share of movers kept=%2f", share_mover_keep)
  # Random subsample: keep rows whose uniform rank falls below the share.
  sim$jdata = sim$jdata[rank(runif(.N))/.N<=share_mover_keep]
  # Estimate without, then with, the heteroscedasticity correction
  # (same order as before, so RNG state and output order are unchanged).
  for (het in c(FALSE, TRUE)) {
    est = m2.trace.estimate(sim, hetero=het)
    est$stats$share_mover_keep = share_mover_keep
    est$stats$itN = itN
    out = rbind(out, data.frame(est$stats))
  }
  return(out)
}
#' Leave-out exercise over all shares (single-call variant)
#'
#' Variant of Do.leaveout.wrap/Do.leaveout.core combined into one
#' function (used by the commented-out cluster code in
#' Do.hybrid.leaveout).
#'
#' @param sim Data (list with sdata and jdata)
#' @param nm_list shares of movers to keep
#' @return data.frame of stacked stats for all shares
#' @export
Do.leaveout.core2<-function(sim, nm_list) {
  resall=data.frame()
  # Baseline AKM on the full set of movers (100% kept).
  rr = m2.trace.estimate(sim,hetero=FALSE)
  rr$stats$share_mover_keep = 100
  resall=rbind(resall,rr$stats)
  # BUG FIX: keep the full mover sample so every share subsamples from
  # the ORIGINAL data. The original subsetted sim$jdata in place, so
  # the subsampling compounded across iterations of the loop.
  jdata_full = sim$jdata
  for (share_mover_keep in nm_list) { # list of shares of movers to keep
    rrr = data.frame()
    flog.info("[Hybrid-leave-out] Share of movers kept=%2f", share_mover_keep)
    sim$jdata = jdata_full[rank(runif(.N))/.N<=share_mover_keep]
    rr = m2.trace.estimate(sim,hetero=FALSE)
    rr$stats$share_mover_keep = share_mover_keep
    rrr = rbind(rrr,data.frame(rr$stats))
    rr = m2.trace.estimate(sim,hetero=TRUE)
    rr$stats$share_mover_keep = share_mover_keep
    rrr = rbind(rrr,data.frame(rr$stats))
    # BUG FIX: accumulate inside the loop; the original rbind sat after
    # the loop, so only the LAST share's results were kept.
    resall=rbind(resall,rrr)
  }
  return(resall)
}
##---COMMENTS FOR CONFIDENTIALITY CHECK DE----
#'Project-level explanation note for the DE confidentiality check
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.project<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "This file estimates variance decompositions of log wages by correcting for mobility bias.
FOR CONFIDENTIALITY CHECK:
The file gives out estimation steps with additional estimation based
information. Data related output will be explained in a note right before the table
All output will be produced in two consecutive tables: one for the confidentiality check and
one for us researchers to copy paste into a csv file.
Raw data is used for all estimations but the output is most closely related to case numbers in
D 4) Event Study
D 5) Summary Stats
D 6) Summary of Distributions
where the case numbers are reported within the tables. In the estimation
D 2) Trace estimation
D 7) Estimating model
the output are Variance decompositions.The case number of observation used to compute
those are given within the table.
D 9) Is a composite command that combines D 6), D 2) and part of D 7). Not all
results are requested with each run."
  flog.warn(msg)
}
#'Project-level explanation note (new layout) for the DE confidentiality check
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.project.new<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "This file estimates variance decompositions of log wages by correcting for mobility bias under different settings.
FOR CONFIDENTIALITY CHECK:
Data related output will be explained in a note right before the table.
All output will be produced in two consecutive tables: one for the confidentiality check and
one for us researchers to copy paste into a csv file.
Raw data is used for all parameter estimations. However most output is obtained through simulations
after parameter estimates have been obtained with the raw data. Hence, most results are simulation-
based and a case number/#of firms is not strictly necessary according to the FDZ guidelines.
We will explain which results use the raw data in turn and which are purely simulation based.
---------------------------------------------------------------------------------------------"
  flog.warn(msg)
}
#'Progress-log explanation note for the DE confidentiality check
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.project.running<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typo fixed: "progression stes" -> "progression steps"
  msg <- "This file shows the progression steps/intermediate steps taken during the estimation.
It is helpful for debugging when the code stopped and no final output file was created.
---------------------------------------------------------------------------------------------"
  flog.warn(msg)
}
#'Hints and notes for confidentiality check in Germany
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.trace<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typo fixed: "leave-out xset" -> "leave-out set"
  msg <- "The following estimation computes the correction to the variance decomposition and does this for
D. 2a) for different assumptions about the homoskedasticity of errors
D. 2b) different samples (full sample, connected set of firms (cf. AKM), leave-out set of firms (cf. Kline et al 2018).
D. 2c) for different assumptions about the inclusion of movers in the estimation."
  flog.warn(msg)
}
#'Hints and notes for confidentiality check in Germany
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.event<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typo fixed: "firm typen" -> "firm types"
  msg <- "The following estimation computes an event study in which we compute the average wage of workers before a move
and the average wage of workers after a move for 4 firm types, defined as in BLM (2018) or Kline (2018) through
average wages. That is, firms with a similar cdf of average worker wages will be grouped together in one group."
  flog.warn(msg)
}
#'Hints and notes for confidentiality check in Germany
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.sets<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typo fixed: "leave-out xset" -> "leave-out set"
  msg <- "The following estimation computes summary statistics for the different sets used
(full sample, connected set of firms (cf. AKM), leave-out set of firms (cf. Kline et al 2018))."
  flog.warn(msg)
}
#'Hints and notes for confidentiality check in Germany
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.stats<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typos fixed: missing verb ("This estimation the" -> "This estimation reports the")
  # and "Describtive" -> "Descriptive"
  msg <- "This estimation reports the summary statistics by firm cluster 1 to 10 and for all firms combined. Firm clusters
are defined through average worker wages as described in BLM 2018
Descriptive Statistics
The rows describe the contained information, where the first row describes the firm cluster number
FOR CONFIDENTIALITY TEST:
In this table, results computed with less than the required min. number of obs. are denoted as NaN for confidentiality."
  flog.warn(msg)
}
#'Hints and notes for confidentiality check in Germany
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.wages<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typos fixed: missing verb ("This estimation the" -> "This estimation reports the")
  # and "displaced as NaN" -> "displayed as NaN"
  msg <- "This estimation reports the summary statistics by firm cluster 1 to 10 and for all firms combined. Firm clusters
are defined through average worker wages as described in BLM 2018
Wage differences
This table shows the average wage difference of workers moving between firms of type x/y (row/column).
FOR CONFIDENTIALITY TEST:
Wage differences for cases with less than the required min. number of obs. have been removed and are displayed as NaN."
  flog.warn(msg)
}
#'Hints and notes for confidentiality check in Germany
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.stats.mobility<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typo fixed: missing verb ("This estimation the" -> "This estimation reports the")
  msg <- "This estimation reports the summary statistics by firm cluster 1 to 10 and for all firms combined. Firm clusters
are defined through average worker wages as described in BLM 2018
Moving patterns
This table shows the number of workers moving between firms of type x/y (row/column)"
  flog.warn(msg)
}
#'Hints and notes for confidentiality check in Germany
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.sets.table<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "The following table reports summary statistics for the two sets."
  flog.warn(msg)
}
#'Hints and notes for confidentiality check in Germany
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.event.table<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "The following table reports:
- first row: mean wage of workers in period -1 starting at firm group x
- second row: mean wage of workers in period 1 ending up at firm group y
- third row: # of firms in cell
- fourth row: # of workers in cell
- fifth row: mobility from group of firms x to group of firm y (e.g. 1 to 1, 1 to 4 etc)
FOR CONFIDENTIALITY TEST:
In this table, results computed with less than the min number of required obs. are denoted as NaN (missing) for confidentiality."
  flog.warn(msg)
}
#'Hints and notes for confidentiality check in Germany
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.trace.table<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "The following table reports the wage variance contributions when controlling for biases
under certain assumptions on the error variance.
FOR CONFIDENTIALITY TEST:
This table is only shown if results were computed with more than the required
min number of obs. (cf. Do.NaN.trace just before)
The table contains for the confidentiality check
- total number of firms (second column)
- total number of stayers and total number of workers: number of
workers (4th and 5th column)
Caution: 'set number of firms=10' just means that we group firms
into ten groups, not # of firms!"
  flog.warn(msg)
}
#'Explanation note for the BLM model estimation step
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.estimate<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typos fixed: missing space after "decomposition." and "stimators" -> "estimators"
  msg <- "The following estimates the variance decomposition of wages using the BLM model. It hence estimates the
model and then SIMULATES data to estimate the variance decomposition. Different estimators are used for this.
a) LL estimator
b) LP estimator
c) QL estimator
d) MX estimator.
After each of these estimations, we put out a covariance matrix of wage components due to workers and firms.
After all desired estimators have been computed we further put out a table with variance decompositions across all estimators."
  flog.warn(msg)
}
#'Explanation note for the interacted-model expected-wage table
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.final_interactedEEm<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "The following table shows the expected, estimated wage for workers of different type (rows)
and firms of different type(columns) estimated through the interaction algorithm. These are outputs of simulated data
given the estimated parameters of the model. Hence no data is reported here."
  flog.warn(msg)
}
#'Explanation note for the interacted-model quantile table
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.final_interactedquantiles<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "The following table shows the expected, estimated wage (3rd column) for workers of different type (first column)
and firms of different type (second columns) estimated through the interaction algorithm. Within the
firm and worker types we further distinguish according to the distribution of
productivity of the worker type (before last column). These are outputs of simulated data
given the estimated parameters of the model. Hence no data is reported here."
  flog.warn(msg)
}
#'Explanation note for the random-attrition robustness table
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.final_randomattrition<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typos fixed: missing space after "decomposition.", "TRaceHO" -> "TraceHO",
  # stray ") )" after "column 15"
  msg <- "The following shows the variance decomposition of wages using the BLM model. It hence estimates the
model and then SIMULATES data to estimate the variance decomposition. Different estimators are used for this.
a) AKM
b) CRE
c) CRE-0
d) Interacted CRE
e) TraceHE
f) TraceHO
All of these estimators make different assumptions about the DGP.
Furthermore, in this estimation we restrict attention to a given number of workers (# of workers in column 15)
that are drawn randomly (number of draw is in the last column).
These are outputs of simulated data given the estimated parameters of the model. Hence no data is reported here."
  flog.warn(msg)
}
#'Explanation note for the sample-size table
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.final_samplesize<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "This table shows the case number for each set (column 1) that we compute in the model.
The second column shows the number of firms, the third the number of workers."
  flog.warn(msg)
}
#Dispatch the matching explanation note for a given output filename.
#Each output file exists in a plain and a "-test" variant; both map to the
#same note, so the suffix is stripped before dispatching.
Do.explanations<-function(filename){
  reporting=0
  base <- sub("-test$", "", filename)
  switch(base,
    "final_interactedEEm_DE"       = Do.info.DE.final_interactedEEm(reporting),
    "final_interactedquantiles_DE" = Do.info.DE.final_interactedquantiles(reporting),
    "final_output_DE"              = Do.info.DE.final_output(reporting),
    "final_randomattrition_DE"     = Do.info.DE.final_randomattrition(reporting),
    "final_samplesize_DE"          = Do.info.DE.final_samplesize(reporting),
    "final_sortingstats_DE"        = Do.info.DE.final_sortingstats(reporting),
    "final_clusterstats_DE"        = Do.info.DE.final_clusterstats(reporting),
    "final_interactedcoefs_DE"     = Do.info.DE.final_interactedcoefs(reporting)
  )
}
#'Explanation note for the interacted-model coefficient table
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.final_interactedcoefs<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "The following table reports the results of the estimation
of the interacted model, where A and B are the two cluster-specific parameters."
  flog.warn(msg)
}
#'Explanation note for the within-cluster wage-distribution table
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.final_clusterstats<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "The following table reports the distribution of wages within clusters.
The table contains for the confidentiality check
- the cluster ID (first column)
- the number of firms (second column)
- the mean size (3rd column)
- the number of firms with more than 10 or 50 movers (4th and 5th)
- number of workers in cell (6th column)
- the remaining columns: wage statistics: wage mean, variance, skewness, kurtosis"
  flog.warn(msg)
}
#'Explanation note for the sorting-statistics table
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.final_sortingstats<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typo fixed: "fr each" -> "for each"
  msg <- "The following table reports the distribution of wages.
The table contains for the confidentiality check
- wages: log wage
- N: the number of observations in each bin of the wage distribution
this is done for each worker and firm cluster combination (first two columns)."
  flog.warn(msg)
}
#'Explanation note for the main variance-decomposition output
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.final_output<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typos fixed: missing spaces after "decomposition." / "DGP." and
  # "TRaceHO" -> "TraceHO"
  msg <- "The following shows the variance decomposition of wages using the BLM model. It hence estimates the
model and then SIMULATES data to estimate the variance decomposition. Different estimators are used for this.
a) AKM
b) CRE
c) CRE-0
d) Interacted CRE
e) TraceHE
f) TraceHO
All of these estimators make different assumptions about the DGP. These are outputs of simulated data
given the estimated parameters of the model. Hence no data is reported here."
  flog.warn(msg)
}
#'Explanation note for the per-estimator variance-decomposition table
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.estimate.decomp<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "This table shows the variance decomposition for all four estimation types
- cor_kl : correlation firm and worker part of wages
- cov_kl : covariance firm and worker part of wages
- var_k : variance firm part wages
- var_l : variance worker part wages
- rsq: RSQ of estimation model
- N_s: Number of observations of stayers
- N_j: Number of observations for movers
- Share: Share of movers in total observations.
FOR CONFIDENTIALITY TEST:
This table is only shown if results were computed with more than the required min number of obs."
  flog.warn(msg)
}
#'Explanation note for the estimation-window robustness runs
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.estimate.window<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  # Typos fixed: missing space after "decomposition." and "stimators" -> "estimators"
  msg <- "The following estimates the variance decomposition of wages using the BLM model. It hence estimates the
model and then SIMULATES data to estimate the variance decomposition. Different estimators are used for this.
a) LL estimator
b) LP estimator
c) QL estimator
d) MX estimator.
After all desired estimators have been computed we further put out a table with variance decompositions across all estimators."
  flog.warn(msg)
}
#'Explanation note for the mover-distribution table
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.dist<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "The following table reports the distribution of the mobility patterns across firms.
The table contains for the confidentiality check
- N: number of movers in a firm
- N_f: number of firms with N movers
- cdf_N_f: cdf of this number of movers.
Please note: N=1 does NOT mean that there is only one observation, hence this does not
need to be deleted.
e.g. N=2 N_f=1045 cdf_N_f=.51 means that there are 1045 firms with 2 movers
and that 51 % of all observations of movers are in this case (The number of observations
in each cell is therefore N_f*N.)"
  flog.warn(msg)
}
#'Explanation note for the wage-distribution table
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.dist.wage<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "The following table reports the distribution of wages.
The table contains for the confidentiality check
- wage: log wage
- cdf_wage: cdf of this wage
- N: the number of observations in each bin of the wage distribution
(taken from hist)."
  flog.warn(msg)
}
#'Explanation note introducing the distribution analyses
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.dist.all<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "Below we analyze the distribution of movers within firms and the distribution of wages."
  flog.warn(msg)
}
#'Explanation note for the composite estimation command
#'@param reporting 0 prints the note to the log; any other value does nothing
#'@export
Do.info.DE.composite<-function(reporting){
  # Notes are only emitted in confidentiality-check mode
  if (reporting != 0) {
    return(invisible(NULL))
  }
  msg <- "This analysis conducts several estimations at once
A) Summary of data subsets
B) AKM estimation
C) Trace estimation
D) BLM trace estimation
E) Clustering and estimation of BLM model."
  flog.warn(msg)
}
##---EXPORTING, NICE TABLES----
#' Write csv table to log file, for DE
#'
#' @param opts Options list; opts$paths$final is the folder holding the csv files.
#' @param filename Base name (without ".csv") of the result file to echo.
#' @param print_row_name Accepted for interface compatibility; not used here.
#' @export
Do.log.table<-function(opts, filename, print_row_name=FALSE) {
  # Load the previously exported result file
  tbl <- fread(paste0(opts$paths$final, filename, ".csv"))
  flog.info("Output as Table")
  print(tbl, row.names = FALSE, topn=nrow(tbl))
  # Echo the same table as machine-readable csv wrapped in grep-able tags
  flog.info("Output as CSV Table")
  cat(sprintf("####### @CSVBEGIN %s\n",filename))
  write.csv.log(tbl)
  cat(sprintf("####### @CSVEND %s\n",filename))
}
#' Export a data.frame either to the log file or directly to disk
#'
#' @param data Data to export.
#' @param filename Destination path (type=0) or label used in the log tags (type=1).
#' @param type 0 to save to disk, 1 to send to the log
#' @param print_row_name Whether row names are written to the csv file (type=0 only).
#' @export
Do.export.table<-function(data, filename, type=0, print_row_name=FALSE) {
  if (type==0) {
    # Persist to disk as csv
    flog.info("[saving] Saving results to %s",filename)
    write.csv(data,filename,row.names = print_row_name)
  } else {
    # Print to the console, then repeat in CSV format wrapped into tags
    flog.info("Output as Table")
    print(data, row.names = FALSE, topn=nrow(data))
    flog.info("Output as CSV Table")
    open_tag  <- sprintf("####### @CSVBEGIN %s\n",filename)
    close_tag <- sprintf("####### @CSVEND %s\n",filename)
    cat(open_tag)
    write.csv.log(data)
    cat(close_tag)
  }
}
#' Write a data.frame to the log as CSV text
#'
#' Serializes `d` with write.csv into an in-memory text connection and cat()s
#' the resulting lines, one per output line.
#' @param d A data.frame (or data.table) to serialize.
#' @export
write.csv.log <- function(d) {
  tc <- textConnection("NewLog", "w") #"a" to append, "w" to write new
  write.csv(d,file=tc)
  close(tc)
  # BUG FIX: the previous cat(paste0(NewLog,"\n")) joined the vector elements
  # with cat's default sep=" ", which prefixed every line after the first with
  # a stray space and corrupted the CSV block in the log.
  cat(NewLog, sep = "\n")
  cat("\n")
}
#' Export the event-study figure (eps) or print the table to the log
#'
#' @param s2 Table of mean log wages around the move; rows 1:2 are periods -1/1,
#'   columns 1:4 the firm-group transitions. (assumed from the plot labels --
#'   TODO confirm against the caller)
#' @param directory_Res Output directory for the .eps file.
#' @param country Country tag used in the output file name.
#' @param reporting 1: save the eps figure to disk; 0: print the table to the log.
#' @export
Do.export.event<-function(s2, directory_Res, country, reporting){
  file_id=paste0(directory_Res, "res_event_", country, ".eps")
  if (reporting==1){
    setEPS()
    postscript(file_id)
    matplot(c(-1,1),s2[1:2,1:4], type="b", xlab="Time (0=move)", ylab="Mean Log Wage of Movers",pch=15:19, xaxp = c(-1, 1, 2) , lwd=2)
    names<-c("1 to 1", "1 to 4", "4 to 1", "4 to 4")
    legend("bottom", inset=1, legend=names, col=c(1:5),pch=15:19,
           bg= ("white"), xpd = TRUE, horiz = TRUE, bty = "n", cex = 2)
    dev.off()
  } else if (reporting==0) {
    # BUG FIX: removed the bare `s2` and `s2[]` statements -- inside a function
    # they do not auto-print, so they were dead code.
    print("Event Study")
    Do.export.results(s2, reporting, file_id)
  }
}
#Function creates table with summary of different estimators for BLM var decomp
#' Assemble the BLM variance-decomposition summary table across estimators:
#' one row per estimator column of res_N, with country tag, estimator label,
#' the transposed decomposition stats, and the counts N_s/N_j plus Share.
#' NOTE(review): num_obs and directory_Res are currently unused here.
#' NOTE(review): estimator labels come from the global `opts$do`; this assumes
#' their order matches the columns of res_N -- confirm with the caller.
#' @export
Do.decomp.table<-function(res_N, num_obs, Ns, Nm, country, directory_Res, reporting){
# NOTE(review): computes Ns/(Ns+Nm), i.e. the stayers' share, while the
# legend below calls it "Share of movers" -- confirm which is intended.
share<-Ns/(Ns+Nm)
# One row of observation counts per estimator column
X_N=data.table(N_s=rep(Ns,ncol(res_N)), N_j=rep(Nm,ncol(res_N)), Share=rep(share, ncol(res_N)))
cdt=data.table(Country=rep(country,ncol(res_N)))
# Estimator labels, kept only for the estimators requested in opts$do
# (the `else {}` branches contribute NULL, which c() drops)
edt=data.table(Est=c(( if ("ll" %in% opts$do) "LL" else {}), ( if (("lp" %in% opts$do)) "LP" else {}), ( if (("ql" %in% opts$do)) "QL" else {}), ( if (("mx" %in% opts$do)) "MX" else {})))
dd=cbind(cdt, edt, t(res_N), rbind(X_N))
# In confidentiality-check mode, also emit the explanatory legend to the log
if (reporting==0){
invisible(Info_T<-"This table shows the variance decomposition for all four estimation types
- cor_kl : correlation firm and worker part of wages
- cov_kl : covariance firm and worker part of wages
- var_k : variance firm part wages
- var_l : variance worker part wages
- rsq: RSQ of estimation model
- N_s: Number of observations of stayers
- N_j: Number of observations for movers
- Share: Share of movers in total observations.
FOR CONFIDENTIALITY TEST:
This table is only shown if results were computed with more than the required min number of obs.")
flog.info(Info_T)
}
return(dd)
}
#Function computes group averages
#' Export mean wages by worker type (l) and firm type (k): save a ggplot figure
#' (reporting==1) or print the table with explanatory notes (reporting==0).
#' Wrapped in try() so a plotting/export failure does not abort the run.
#' @export
Do.export.groups<-function(rr2,file_id, reporting ){
try({
if (reporting==1) {
# NOTE(review): the ggplot object is built but never print()-ed; ggsave()
# then saves ggplot2's "last plot" -- confirm it picks up the plot built here.
ggplot(rr2,aes(x=l,y=value,color=factor(k))) + geom_line() + geom_point() + theme_bw()
ggsave(file_id)
} else if (reporting==0) {
flog.warn("Average wage for workers of type l at firm type k. These Results are based on SIMULATIONS, not on data.
Hence there are no case numbers reported!")
flog.warn("Worker and firm types are estimated within the BLM framework. Firm types
represent types as defined by average worker wages per firm. Worker types are
estimated as in BLM (2018). ")
# Export with the globally configured export type (disk or log)
Do.export.table(data=rr2,filename=file_id, type=opts$export_type, print_row_name=TRUE)
}})
}
#' Export Latex Files of Results
#'
#' @param data Data
#' @param reporting whether to save to file (1) or print only on screen ( & into Log file) (0)
#' @param file_id name for File to save
#' @param digitsx number of digits passed to xtable (NULL = xtable default)
#' @param align_option options for latex table; only forwarded when supplied
#' @export
Do.export.results<-function(data,reporting, file_id, digitsx=NULL, align_option={}){
  # Any other reporting value is a no-op, as in the original branching
  if (!(reporting %in% c(0, 1))) {
    return(invisible(NULL))
  }
  # Build the xtable once; forward `align` only when the caller supplied it.
  # This removes the duplicated print logic of the previous implementation.
  if (missing(align_option)){
    xt <- xtable(data, type = "latex", digits = digitsx)
  } else {
    xt <- xtable(data, type = "latex", digits = digitsx, align = align_option)
  }
  if (reporting==1){
    print(xt, file = file_id, floating = FALSE)
  } else {
    flog.info("Output as Table")
    print(data, row.names = FALSE, topn=nrow(data))
    flog.info("Output as Latex Table")
    print(xt, file = "", floating = FALSE)
  }
}
#This function removes results computed with fewer observations than the minimum required number (set as option)
#for the Hybrid Tables
#' @export
Do.NaN.Hybrid<-function(res_Hybrid_connected, opts_min=opts$minN) {
  setDT(res_Hybrid_connected)
  # BUG FIX: use the opts_min argument instead of always reading the global
  # opts$minN, so callers can override the confidentiality threshold.
  too_small <- (res_Hybrid_connected$variable=="nw" & res_Hybrid_connected$value < opts_min)
  # nm/set combinations whose worker count falls below the threshold
  x_nm  <- res_Hybrid_connected$nm[too_small]
  x_set <- res_Hybrid_connected$set[too_small]
  # Blank out every value belonging to a flagged nm/set combination
  res_Hybrid_connected$value[((res_Hybrid_connected$nm %in% x_nm)& (res_Hybrid_connected$set %in% x_set))]<-NaN
  return(res_Hybrid_connected)
}
#This function removes results computed with fewer observations than the minimum required number (set as option)
#for the trace tables
#' @export
Do.NaN.trace<-function(res_Hybrid_connected, opts_min=opts$minN) {
  # BUG FIX: honor the opts_min argument (the global opts$minN was used before,
  # making the parameter dead).
  res_Hybrid_connected[((res_Hybrid_connected$total_number_of_movers+res_Hybrid_connected$total_number_of_stayers)< opts_min)]<-NaN
  return(res_Hybrid_connected)
}
#This function exports tables to csv files or as .tex tables to the log file
#' Build the latex summary table (via the tt_* row builders and the %:% join
#' operator) for the requested mover leave-outs.
#' @param res Results table; filtered to the rows whose nm is in nm_list.
#' @param nm_list Values of nm (min number / share of movers) to include.
#' @param leave_out 0: full table with shares/corrections; 1: compact table with
#'   point estimates plus standard deviations taken from sds.
#' @param sds Table of standard deviations, only used when leave_out==1.
#' NOTE(review): tt_text_row/tt_numeric_row/tt_spacer_row/tt_rule_mid_partial
#' and %:% are project helpers defined elsewhere.
#' @export
Do.generate.table <- function(res, nm_list, leave_out=0, sds={}) {
res = data.table(res)
res = res[nm %in% nm_list]
# nc = number of table columns (label column + one column per result row)
nc = nrow(res)+1
if (leave_out==0){
# Header rows: leave-out thresholds and firms with own type
tt =
tt_text_row("min number of movers") %:% tt_numeric_row(res$nm,dec=0)
tt=tt+tt_text_row("n firms with own type") %:% tt_numeric_row(res$firms_with_own_type,dec=0) +
tt_spacer_row(7)
# Panel: connected set sample sizes
tt = tt +
tt_text_row("") %:% tt_text_row("Connected set",nrow(res),"c") + tt_rule_mid_partial(list(c(2,nc))) +
tt_text_row("n workers") %:% tt_numeric_row(res$set0_nw,dec=0) +
tt_text_row("n firms") %:% tt_numeric_row(res$set0_nf,dec=0)
tt = tt+tt_spacer_row(7)
# Panel: AKM on the connected set with small firms clustered
tt = tt +
tt_text_row("") %:% tt_text_row("AKM, firms with few movers clustered, Connected set",nrow(res),"c") + tt_rule_mid_partial(list(c(2,nc))) +
tt_text_row("n workers") %:% tt_numeric_row(res$set1_nw,dec=0) +
tt_text_row("n firms") %:% tt_numeric_row(res$set1_nf,dec=0) +
tt_text_row("$Var(\\psi)$") %:% tt_numeric_row(res$set1_psi_var,big.mark="") +
tt_text_row("$Var(\\psi)$ share") %:% tt_numeric_row(100*res$set1_psi_var/res$set1_wage_var,big.mark="",percentage=T,dec=1) +
tt_text_row("$Var(\\psi)$ cor") %:% tt_numeric_row(res$set1_psi_var-res$set1_trace,big.mark="") +
tt_text_row("$Var(\\psi)$ cor share")%:% tt_numeric_row(100*(res$set1_psi_var-res$set1_trace)/res$set1_wage_var,big.mark="",percentage=T,dec=1) +
tt_text_row("$Cov(\\psi,alpha_i)$") %:% tt_numeric_row(res$set1_cov,big.mark="")+
tt_text_row("Within cluster firm variance") %:% tt_numeric_row(res$set1_omega_var,big.mark="")+
tt_spacer_row(7)
# Panel: AKM on the leave-out connected set
tt = tt +
tt_text_row("") %:% tt_text_row("AKM, Leave out connected set",nc-1,"c") + tt_rule_mid_partial(list(c(2,nc))) +
tt_text_row("n workers") %:% tt_numeric_row(res$set2_nw,dec=0) +
tt_text_row("n firms") %:% tt_numeric_row(res$set2_nf,dec=0) +
tt_text_row("$Var(\\psi)$") %:% tt_numeric_row(res$set2_psi_var,big.mark="") +
tt_text_row("$Var(\\psi)$ share") %:% tt_numeric_row(100*res$set2_psi_var/res$set2_wage_var,big.mark="",percentage=T,dec=1) +
tt_text_row("$Var(\\psi)$ cor") %:% tt_numeric_row(res$set2_psi_var - res$set2_trace,big.mark="") +
tt_text_row("$Var(\\psi)$ cor share") %:% tt_numeric_row(100*(res$set2_psi_var - res$set2_trace)/res$set2_wage_var,big.mark="",percentage=T,dec=1) +
tt_text_row("$Cov(\\psi,alpha_i)$") %:% tt_numeric_row(res$set2_cov,big.mark="")+
tt_spacer_row(7)
# Panel: homoskedastic AKM, firms with their own type only
tt = tt +
tt_text_row("") %:% tt_text_row("AKM - Homo - firms with own type only",nrow(res),"c") + tt_rule_mid_partial(list(c(2,nc))) +
tt_text_row("n workers") %:% tt_numeric_row(res$set3_nw,dec=0) +
tt_text_row("n firms") %:% tt_numeric_row(res$set3_nf,dec=0) +
tt_text_row("$Var(\\psi)$") %:% tt_numeric_row(res$set3_psi_var,big.mark="") +
tt_text_row("$Var(\\psi)$ share") %:% tt_numeric_row(100*res$set3_psi_var/res$set3_wage_var,big.mark="",percentage=T,dec=1) +
tt_text_row("$Cov(\\psi,alpha_i)$") %:% tt_numeric_row(res$set3_cov,big.mark="")+
tt_spacer_row(4)
# Panel: heteroskedastic AKM, firms with their own type only
tt = tt +
tt_text_row("") %:% tt_text_row("AKM - Hetero - firms with own type only",nrow(res),"c") + tt_rule_mid_partial(list(c(2,nc))) +
tt_text_row("n workers") %:% tt_numeric_row(res$set4_nw,dec=0) +
tt_text_row("n firms") %:% tt_numeric_row(res$set4_nf,dec=0) +
tt_text_row("$Var(\\psi)$") %:% tt_numeric_row(res$set4_psi_var,big.mark="") +
tt_text_row("$Var(\\psi)$ share") %:% tt_numeric_row(100*res$set4_psi_var/res$set4_wage_var,big.mark="",percentage=T,dec=1) +
tt_text_row("$Cov(\\psi,alpha_i)$") %:% tt_numeric_row(res$set4_cov,big.mark="")+
tt_spacer_row(4)
} else if (leave_out==1){
# Compact variant: point estimates with standard deviations from sds
tt =
tt_text_row("number of movers left-out") %:% tt_numeric_row(res$nm,dec=0)
tt = tt+tt_spacer_row(7)
tt = tt +
tt_text_row("") %:% tt_text_row("AKM, firms with few movers clustered, Connected set",nrow(res),"c") + tt_rule_mid_partial(list(c(2,nc))) +
tt_text_row("n workers") %:% tt_numeric_row(res$set1_nw,dec=0) +
tt_text_row("n firms") %:% tt_numeric_row(res$set1_nf,dec=0) +
tt_text_row("$Var(\\psi)$") %:% tt_numeric_row(res$set1_psi_var,big.mark="") +
tt_text_row("") %:% tt_numeric_row(sds$set1_psi_var,big.mark="") +
# tt_text_row("$Var(\\psi)$ share") %:% tt_numeric_row(100*res$set1_psi_var/res$set1_wage_var,big.mark="",percentage=T,dec=1) +
#
# tt_text_row("$Var(\\psi)$ cor") %:% tt_numeric_row(res$set1_psi_var-res$set1_trace,big.mark="") +
# tt_text_row("$Var(\\psi)$ cor share")%:% tt_numeric_row(100*(res$set1_psi_var-res$set1_trace)/res$set1_wage_var,big.mark="",percentage=T,dec=1) +
# tt_text_row("$Cov(\\psi,alpha_i)$") %:% tt_numeric_row(res$set1_cov,big.mark="")+
# tt_text_row("Within cluster firm variance") %:% tt_numeric_row(res$set1_omega_var,big.mark="")+
tt_spacer_row(7)
tt = tt +
tt_text_row("") %:% tt_text_row("AKM, Leave out connected set",nc-1,"c") + tt_rule_mid_partial(list(c(2,nc))) +
tt_text_row("n workers") %:% tt_numeric_row(res$set2_nw,dec=0) +
tt_text_row("n firms") %:% tt_numeric_row(res$set2_nf,dec=0) +
tt_text_row("$Var(\\psi)$") %:% tt_numeric_row(res$set2_psi_var,big.mark="") +
tt_text_row("") %:% tt_numeric_row(sds$set2_psi_var,big.mark="") +
# tt_text_row("$Var(\\psi)$ share") %:% tt_numeric_row(100*res$set2_psi_var/res$set2_wage_var,big.mark="",percentage=T,dec=1) +
# tt_text_row("$Var(\\psi)$ cor") %:% tt_numeric_row(res$set2_psi_var - res$set2_trace,big.mark="") +
# tt_text_row("$Var(\\psi)$ cor share") %:% tt_numeric_row(100*(res$set2_psi_var - res$set2_trace)/res$set2_wage_var,big.mark="",percentage=T,dec=1) +
# tt_text_row("$Cov(\\psi,alpha_i)$") %:% tt_numeric_row(res$set2_cov,big.mark="")+
tt_spacer_row(7)
tt = tt +
tt_text_row("") %:% tt_text_row("AKM - Homo - firms with own type only",nrow(res),"c") + tt_rule_mid_partial(list(c(2,nc))) +
tt_text_row("n workers") %:% tt_numeric_row(res$set3_nw,dec=0) +
tt_text_row("n firms") %:% tt_numeric_row(res$set3_nf,dec=0) +
tt_text_row("$Var(\\psi)$") %:% tt_numeric_row(res$set3_psi_var,big.mark="") +
tt_text_row("") %:% tt_numeric_row(sds$set3_psi_var,big.mark="") +
# tt_text_row("$Var(\\psi)$ share") %:% tt_numeric_row(100*res$set3_psi_var/res$set3_wage_var,big.mark="",percentage=T,dec=1) +
# tt_text_row("$Cov(\\psi,alpha_i)$") %:% tt_numeric_row(res$set3_cov,big.mark="")+
tt_spacer_row(4)
tt = tt +
tt_text_row("") %:% tt_text_row("AKM - Hetero - firms with own type only",nrow(res),"c") + tt_rule_mid_partial(list(c(2,nc))) +
tt_text_row("n workers") %:% tt_numeric_row(res$set4_nw,dec=0) +
tt_text_row("n firms") %:% tt_numeric_row(res$set4_nf,dec=0) +
tt_text_row("$Var(\\psi)$") %:% tt_numeric_row(res$set4_psi_var,big.mark="") +
tt_text_row("") %:% tt_numeric_row(sds$set4_psi_var,big.mark="") +
# tt_text_row("$Var(\\psi)$ share") %:% tt_numeric_row(100*res$set4_psi_var/res$set4_wage_var,big.mark="",percentage=T,dec=1) +
# tt_text_row("$Cov(\\psi,alpha_i)$") %:% tt_numeric_row(res$set4_cov,big.mark="")+
tt_spacer_row(4)
}
tt
}
##---COMPARATIVE RESULTS----
#' Do Compute stats for sub-groups
#'
#' @param ad Data
#' @param all_coworkers 1:compute sample split for size of firm
#' @export
#' Estimate the mini-model separately for 14 worker/firm subgroups
#'
#' For each of 14 specifications (young/old workers, manufacturing vs. not,
#' large/small firms, high/low average-wage firms, young/old firms, and
#' interactions of age/firm-age with firm wage), the stayer (`sdata`) and
#' mover (`jdata`) samples are subset, firms are re-clustered BLM-style
#' (k-means on wage ecdf measures, k = 10) and the mini-model is re-estimated.
#' Results for all specifications are stacked into one data.frame.
#'
#' @param ad List with data.tables `sdata` (stayers) and `jdata` (movers).
#' @param all_coworkers If 1, also run the firm-size splits (specs 5 and 6),
#'   which need the full set of coworkers to measure firm size.
#' @param opts_min Minimum number of mover observations; result rows based on
#'   fewer movers are blanked to NaN before returning.
#' @return data.frame, one row per estimated specification: spec label, sample
#'   means (age, manuf share, firm age, firm size), variance decomposition
#'   (rsq, var/cor/cov of worker and firm effects) and sample sizes.
#'
#' NOTE(review): `sim<-ad` copies the list but NOT the underlying data.tables;
#' the `:=` joins below therefore add/overwrite an `mw` column on the caller's
#' tables by reference across iterations. Each spec recomputes `mw` before
#' using it, so this looks benign — TODO confirm no caller relies on `ad`
#' being unmodified.
Do.groups<-function(ad,all_coworkers=0, opts_min=opts$minN) {
stats<-data.frame()
for (speci in 1:14){
#Initialize
stat<-list()
sim<-ad
if (speci==1) {
stat$spec<-"young"
sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
mean_age<-mean(sim$tdata$age,na.rm=TRUE)
#Keep only workers at or below the mean age
sim$sdata<-sim$sdata[age<=mean_age,]
sim$jdata<-sim$jdata[age<=mean_age,]
} else if (speci==2) {
stat$spec<-"old"
sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
mean_age<-mean(sim$tdata$age,na.rm=TRUE)
#Keep only workers above the mean age
sim$sdata<-sim$sdata[age>mean_age,]
sim$jdata<-sim$jdata[age>mean_age,]
} else if (speci==3) {
stat$spec<-"manuf"
#Manufacturing workers only
sim$sdata<-sim$sdata[manuf==1,]
sim$jdata<-sim$jdata[manuf==1,]
} else if (speci==4) {
stat$spec<-"No manuf"
#Non-manufacturing workers only
sim$sdata<-sim$sdata[manuf==0,]
sim$jdata<-sim$jdata[manuf==0,]
} else if ((speci==5) & (all_coworkers==1)) {
stat$spec<-"large"
#Firm Size: recompute observation counts per firm from the pooled data
sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
sim$tdata[,Ij:=rowid(f1)]
sim$tdata[,mw:=.N,by=f1]
setkey(sim$tdata,f1)
# NOTE(review): X[i, v:=i.v] joins on X's key; assumes jdata/sdata are
# keyed by f1 upstream — TODO confirm
sim$jdata[sim$tdata,mw:=i.mw]
sim$sdata[sim$tdata,mw:=i.mw]
# Ij==1 flags one row per firm, so this is the unweighted mean firm size
mean_size<-mean(sim$tdata$mw[sim$tdata$Ij==1],na.rm=TRUE)
# NOTE(review): threshold computed from mw but the split filters on the
# pre-existing N_j column — confirm this is intended
sim$sdata<-sim$sdata[N_j>mean_size,]
sim$jdata<-sim$jdata[N_j>mean_size,]
} else if ((speci==6) & (all_coworkers==1)) {
stat$spec<-"small"
#Firm Size: same construction as spec 5, keep firms at or below mean size
sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
sim$tdata[,Ij:=rowid(f1)]
sim$tdata[,mw:=.N,by=f1]
setkey(sim$tdata,f1)
sim$jdata[sim$tdata,mw:=i.mw]
sim$sdata[sim$tdata,mw:=i.mw]
mean_size<-mean(sim$tdata$mw[sim$tdata$Ij==1],na.rm=TRUE)
sim$sdata<-sim$sdata[N_j<=mean_size,]
sim$jdata<-sim$jdata[N_j<=mean_size,]
} else if (speci==7){
#Average firm wage: keep firms above the size-weighted mean firm wage
stat$spec<-"high W."
sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
ts<-sim$tdata[,list(mw=mean(y1, na.rm=T), N_j=max(N_j)),by=f1]
setkey(ts,f1)
sim$jdata[ts,mw:=i.mw, on=("f1")]
sim$sdata[ts,mw:=i.mw, on=("f1")]
mean_w<-weighted.mean(ts$mw,ts$N_j, na.rm=TRUE)
sim$sdata<-sim$sdata[mw>mean_w,]
sim$jdata<-sim$jdata[mw>mean_w,]
} else if (speci==8){
#Average firm wage: keep firms at or below the size-weighted mean
stat$spec<-"Low W."
sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
ts<-sim$tdata[,list(mw=mean(y1, na.rm=T), N_j=max(N_j)),by=f1]
setkey(ts,f1)
sim$jdata[ts,mw:=i.mw, on=("f1")]
sim$sdata[ts,mw:=i.mw, on=("f1")]
mean_w<-weighted.mean(ts$mw,ts$N_j, na.rm=TRUE)
sim$sdata<-sim$sdata[mw<=mean_w,]
sim$jdata<-sim$jdata[mw<=mean_w,]
} else if (speci==9){
# Firm Age: keep firms at or below the (unweighted) mean firm age
stat$spec<-"Young Firms"
sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
ts<-sim$tdata[,list(mw=mean(firm_age, na.rm=T), N_j=max(N_j)),by=f1]
setkey(ts,f1)
sim$jdata[ts,mw:=i.mw, on=("f1")]
sim$sdata[ts,mw:=i.mw, on=("f1")]
mean_w<-mean(ts$mw, na.rm=TRUE)
sim$sdata<-sim$sdata[mw<=mean_w,]
sim$jdata<-sim$jdata[mw<=mean_w,]
}else if (speci==10){
# Firm Age: keep firms above the mean firm age
stat$spec<-"Old Firms"
sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
ts<-sim$tdata[,list(mw=mean(firm_age, na.rm=T), N_j=max(N_j)),by=f1]
setkey(ts,f1)
sim$jdata[ts,mw:=i.mw, on=("f1")]
sim$sdata[ts,mw:=i.mw, on=("f1")]
mean_w<-mean(ts$mw, na.rm=TRUE)
sim$sdata<-sim$sdata[mw>mean_w,]
sim$jdata<-sim$jdata[mw>mean_w,]
} else if (speci==11) {
# Firm wage measured on old workers only; keep low-wage firms.
# Firms with no old workers get NA mw and are dropped by the filter.
stat$spec<-"low W, old"
sim$txdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
mean_age<-mean(sim$txdata$age,na.rm=TRUE)
sim$tdata<-rbind(sim$sdata[age>mean_age], sim$jdata[age>mean_age], fill=TRUE)
ts<-sim$tdata[,list(mw=mean(y1, na.rm=T), N_j=max(N_j)),by=f1]
setkey(ts,f1)
sim$jdata[ts,mw:=i.mw, on=("f1")]
sim$sdata[ts,mw:=i.mw, on=("f1")]
mean_w<-weighted.mean(ts$mw,ts$N_j, na.rm=TRUE)
sim$sdata<-sim$sdata[mw<=mean_w,]
sim$jdata<-sim$jdata[mw<=mean_w,]
} else if (speci==12) {
# Firm wage measured on old workers only; keep high-wage firms
stat$spec<-"high W, old"
sim$txdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
mean_age<-mean(sim$txdata$age,na.rm=TRUE)
sim$tdata<-rbind(sim$sdata[age>mean_age], sim$jdata[age>mean_age], fill=TRUE)
ts<-sim$tdata[,list(mw=mean(y1, na.rm=T), N_j=max(N_j)),by=f1]
setkey(ts,f1)
sim$jdata[ts,mw:=i.mw, on=("f1")]
sim$sdata[ts,mw:=i.mw, on=("f1")]
mean_w<-weighted.mean(ts$mw,ts$N_j, na.rm=TRUE)
sim$sdata<-sim$sdata[mw>mean_w,]
sim$jdata<-sim$jdata[mw>mean_w,]
} else if (speci==13) {
# Firm wage measured on old FIRMS only (firm_age above the across-firm
# mean); keep low-wage firms among those
stat$spec<-"low W, old firms"
sim$txdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
firm_d<-sim$txdata[, list(mw=mean(firm_age)), by=f1]
mean_age<-mean(firm_d$mw,na.rm=TRUE)
sim$tdata<-rbind(sim$sdata[firm_age>mean_age], sim$jdata[firm_age>mean_age], fill=TRUE)
ts<-sim$tdata[,list(mw=mean(y1, na.rm=T), N_j=max(N_j)),by=f1]
setkey(ts,f1)
sim$jdata[ts,mw:=i.mw, on=("f1")]
sim$sdata[ts,mw:=i.mw, on=("f1")]
mean_w<-weighted.mean(ts$mw,ts$N_j, na.rm=TRUE)
sim$sdata<-sim$sdata[mw<=mean_w,]
sim$jdata<-sim$jdata[mw<=mean_w,]
} else if (speci==14) {
# Firm wage measured on old firms only; keep high-wage firms among those
stat$spec<-"high W, old firms"
sim$txdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
firm_d<-sim$txdata[, list(mw=mean(firm_age)), by=f1]
mean_age<-mean(firm_d$mw,na.rm=TRUE)
sim$tdata<-rbind(sim$sdata[firm_age>mean_age], sim$jdata[firm_age>mean_age], fill=TRUE)
ts<-sim$tdata[,list(mw=mean(y1, na.rm=T), N_j=max(N_j)),by=f1]
setkey(ts,f1)
sim$jdata[ts,mw:=i.mw, on=("f1")]
sim$sdata[ts,mw:=i.mw, on=("f1")]
mean_w<-weighted.mean(ts$mw,ts$N_j, na.rm=TRUE)
sim$sdata<-sim$sdata[mw>mean_w,]
sim$jdata<-sim$jdata[mw>mean_w,]
}
flog.info("[compare groups] Specification: %s",stat$spec)
# Specs 5 and 6 are only estimated when all_coworkers==1 (otherwise their
# if-branches above did not subset and estimation would duplicate "all")
if ((speci!=6 & speci!=5) | ((speci==6) & (all_coworkers==1)) | ((speci==5) & (all_coworkers==1))) {
#Cluster anew: ecdf-based grouping measures (Nw=20 points) on wages
ms = grouping.getMeasures(sim,"ecdf",Nw=20,y_var = "y1")
# Group firms into k=10 classes via repeated k-means
grps = grouping.classify.once(ms,k = 10,nstart = 1000,iter.max = 200,step=250)
# Append the cluster assignments (j1/j2) to the data
sim = grouping.append(sim,grps$best_cluster,drop=T)
#Sample statistics for this subgroup
stat$age<-mean(c(sim$sdata$age, sim$jdata$age), na.rm=TRUE)
stat$manuf<-mean(c(sim$sdata$manuf, sim$jdata$manuf), na.rm=TRUE)
sim$tdata<-rbind(sim$sdata, sim$jdata, fill=TRUE)
sim$tdata[,Ij:=rowid(f1)]
sim$tdata[,mw:=mean(firm_age, na.rm=T),by=f1]
sim$tdata[,msz:=.N,by=f1]
# Ij==1 keeps one row per firm: unweighted across-firm means
stat$mw<-mean(sim$tdata$mw[sim$tdata$Ij==1],na.rm=TRUE)
stat$N_j<-mean(sim$tdata$msz[sim$tdata$Ij==1],na.rm=TRUE)
#Estimate Model: cluster-pair moments for movers, cluster moments for stayers
mstats = sim$jdata[,list(m1=mean(y1),sd1=sd(y1),
m2=mean(y2),sd2=sd(y2),
v12 = cov(y1,y2),.N),list(j1,j2)]
cstats = sim$sdata[,list(m1=mean(y1),sd1=sd(y1),
m2=mean(y2),sd2=sd(y2),
v12 = cov(y1,y2),.N),list(j1)]
res_ql = m2.minirc.estimate(cstats,mstats,method = 2)
#Save variance-decomposition results
stat$rsq<-res_ql$vdec$rsq1
stat$sig_psi<-res_ql$vdec$stats$var_l
stat$sig_alph<-res_ql$vdec$stats$var_k
stat$cor<-res_ql$vdec$stats$cor_kl
stat$cov<-res_ql$vdec$stats$cov_kl
#Sample sizes and mover share
stat$Ns<-nrow(sim$sdata)
stat$Nm<-nrow(sim$jdata)
stat$share<-stat$Nm/(stat$Nm+stat$Ns)
stats<-rbind(stats,stat)
}
if (speci==1){
# First rbind may have coerced spec to factor; force character so
# later specifications append with their own labels
stats$spec<-as.character(stats$spec)
}
}
flog.info("Only Results with required min obs printed")
# Logical row mask recycled across all columns: blanks entire rows whose
# mover count is below the minimum. NOTE(review): this also overwrites the
# character `spec` column of those rows with NaN — confirm intended.
indx_more20_ss<-matrix((stats$Nm<opts_min),nrow=nrow(stats),(ncol=ncol(stats)))
stats[indx_more20_ss]<-NaN
return(stats)
}
#' Year-by-year mini-model estimation for all / high-wage / low-wage firms
#'
#' Reads the pooled multi-year files (suffix "_all"), and for each year except
#' the last estimates the mini-model three times: on all firms, on firms above
#' the size-weighted mean firm wage, and on firms at or below it. Firms are
#' re-clustered (ecdf measures, k = 10) for every year/specification.
#'
#' @param directory_Data Directory containing sdata_all / jdata_all files.
#' @param data_type Passed through to Do.reading.data.
#' @param filtering Passed through to Do.reading.data.
#' @param residualize If 1, residualize wages before estimation.
#' @param opts_min Minimum number of mover observations; rows with fewer
#'   movers are blanked to NaN before returning.
#' @return data.table, one row per year x specification, with variance
#'   decomposition results and sample sizes.
#' @export
#'
Do.comparison.time<-function(directory_Data, data_type, filtering, residualize=1, opts_min=opts$minN) {
#Note: this analysis requires a file sdata_all.csv and jdata_all.csv
#Read in the data with information for all years
ad<-Do.reading.data(directory_Data, filtering, data_suffix="_all", data_type)
stats<-data.table()
for (year in (min(ad$sdata$anno)):(max(ad$sdata$anno)-1)){
#Subset to observations of this year
ad1<-list(sdata=ad$sdata[(ad$sdata$anno==year)], jdata=ad$jdata[(ad$jdata$anno==year)])
#Residualize data
ad1<-Do.residualize(ad1, residualize)
# NOTE(review): this copies the list, not the data.tables inside it; the
# `:=` joins below still mutate the shared tables by reference. The mw
# column is recomputed per specification, so this appears benign — confirm.
ad1.copy<-ad1
for (speci in 1:3) {
ad1<-ad1.copy
stat=list()
flog.info("[compare across time] Year: %i",year)
stat$years<-year
stat$spec="all"
if (speci==2){
#Average firm wage: keep firms above the size-weighted mean
stat$spec<-"high W."
ad1$tdata<-rbind(ad1$sdata, ad1$jdata, fill=TRUE)
ts<-ad1$tdata[,list(mw=mean(y1, na.rm=T), N_j=max(N_j)),by=f1]
setkey(ts,f1)
ad1$jdata[ts,mw:=i.mw, on=("f1")]
ad1$sdata[ts,mw:=i.mw, on=("f1")]
mean_w<-weighted.mean(ts$mw,ts$N_j, na.rm=TRUE)
ad1$sdata<-ad1$sdata[mw>mean_w,]
ad1$jdata<-ad1$jdata[mw>mean_w,]
} else if (speci==3){
#Average firm wage: keep firms at or below the size-weighted mean
stat$spec<-"low W."
ad1$tdata<-rbind(ad1$sdata, ad1$jdata, fill=TRUE)
ts<-ad1$tdata[,list(mw=mean(y1, na.rm=T), N_j=max(N_j)),by=f1]
setkey(ts,f1)
ad1$jdata[ts,mw:=i.mw, on=("f1")]
ad1$sdata[ts,mw:=i.mw, on=("f1")]
mean_w<-weighted.mean(ts$mw,ts$N_j, na.rm=TRUE)
ad1$sdata<-ad1$sdata[mw<=mean_w,]
ad1$jdata<-ad1$jdata[mw<=mean_w,]
}
flog.info("[compare groups] Specification: %s",stat$spec)
# Wage variance before clustering (sd squared of y1_l)
stat$sd<-sd(c(ad1$sdata$y1_l, ad1$jdata$y1_l), na.rm=TRUE)^2
#Cluster: ecdf-based measures, then k=10 k-means classification
ms = grouping.getMeasures(ad1,"ecdf",Nw=20,y_var = "y1")
# then we group we choose k=10
grps = grouping.classify.once(ms,k = 10,nstart = 1000,iter.max = 200,step=250)
# finally we append the cluster assignments to the data
ad1 = grouping.append(ad1,grps$best_cluster,drop=T)
# Wage variance after clustering (grouping.append may drop observations)
stat$sd2<-sd(c(ad1$sdata$y1_l, ad1$jdata$y1_l), na.rm=TRUE)^2
#Estimate: cluster-pair moments for movers, cluster moments for stayers
mstats = ad1$jdata[,list(m1=mean(y1),sd1=sd(y1),
m2=mean(y2),sd2=sd(y2),
v12 = cov(y1,y2),.N),list(j1,j2)]
cstats = ad1$sdata[,list(m1=mean(y1),sd1=sd(y1),
m2=mean(y2),sd2=sd(y2),
v12 = cov(y1,y2),.N),list(j1)]
res_ql = m2.minirc.estimate(cstats,mstats,method = 2)
stat$rsq<-res_ql$vdec$rsq1
stat$sig_psi<-res_ql$vdec$stats$var_l
stat$sig_alph<-res_ql$vdec$stats$var_k
stat$cor<-res_ql$vdec$stats$cor_kl
stat$cov<-res_ql$vdec$stats$cov_kl
#Sample sizes and mover share
stat$Ns<-nrow(ad1$sdata)
stat$Nm<-nrow(ad1$jdata)
stat$share<-stat$Nm/(stat$Nm+stat$Ns)
#Combine
stats<-rbind(stats,stat)
if (speci==1){
# Force character spec label after the first append (factor guard)
stats$spec<-as.character(stats$spec)
}
}
}
flog.info("Only Results with required min obs printed")
# Logical row mask recycled over all columns: NaN-out rows with too few movers
indx_more20_ss<-matrix((stats$Nm<opts_min),nrow=nrow(stats),(ncol=ncol(stats)))
stats[indx_more20_ss]<-NaN
return(stats)
}
#' Stack movers and stayers across both periods into long format
#'
#' Concatenates outcomes, firm identifiers and worker identifiers from the
#' mover (`jdata`) and stayer (`sdata`) tables for periods 1 and 2 into three
#' parallel vectors, suitable for AKM-style estimation on a single long panel.
#'
#' @param sim List with elements `jdata` and `sdata`, each containing columns
#'   `y1`, `y2`, `f1`, `f2` and `cod_pgr`.
#' @return List with `y1` (stacked outcomes for both periods), `firm_ID` and
#'   `worker_ID` (both coerced to numeric), all of equal length.
#' @export
#'
Do.long_data<-function(sim){
mv <- sim$jdata
st <- sim$sdata
# Stacking order matters and must stay fixed across the three vectors:
# movers period 1, stayers period 1, movers period 2, stayers period 2.
outcome   <- c(mv$y1, st$y1, mv$y2, st$y2)
firms     <- as.numeric(c(mv$f1, st$f1, mv$f2, st$f2))
# Worker IDs repeat because each worker appears once per period
workers   <- as.numeric(c(mv$cod_pgr, st$cod_pgr, mv$cod_pgr, st$cod_pgr))
list(y1 = outcome, firm_ID = firms, worker_ID = workers)
}
#' Leave-out comparison and trace/BLM estimation over rolling data windows
#'
#' For every year in `year_vec` and every window length in `window_vec`,
#' subsets the pooled panel to the window ending in that year, residualizes
#' wages, exports leave-out set comparisons, runs the trace estimator
#' (heteroscedastic and homoscedastic), the BLM trace variants over sets and
#' subsamples, then clusters firms (ecdf measures, k = 10) and estimates the
#' linear mini-model. Each result table is exported to a CSV under `res_path`.
#'
#' @param window_vec Integer vector of window lengths in years.
#' @param year_vec Years to analyze; pass "" to use every year in the data.
#' @param directory_Data Directory holding the sdata_all / jdata_all files.
#' @param data_type Passed through to Do.reading.data.
#' @param filtering Passed through to Do.reading.data.
#' @param residualize If 1, residualize wages before estimation.
#' @param opts_min Minimum observation count below which results are set NaN.
#' @param res_path Output directory for the exported CSV tables.
#' @param opts_country Country tag used in output file names.
#' @param reporting Reporting flag forwarded to the Do.info.* helpers.
#' @return Invisibly NULL; called for its side effects (CSV exports, logging).
#' @export
#'
Do.comparison.window.trace<-function(window_vec, year_vec, directory_Data, data_type, filtering, residualize=1, opts_min=opts$minN, res_path=opts$paths$res, opts_country=opts$country, reporting=opts$reporting) {
  # Note: this analysis requires files sdata_all.csv and jdata_all.csv
  ad <- Do.reading.data(directory_Data, filtering, data_suffix="_all", data_type)
  # Default: analyze every year present in the stayers data. identical()
  # avoids the "condition has length > 1" error that `year_vec==""` raises
  # in modern R whenever a numeric vector of years is supplied.
  if (identical(year_vec, "")) {
    year_vec <- unique(ad$sdata$anno)
  }
  for (year in year_vec) {
    for (window in window_vec) { # window = number of years ending at `year`
      # Subset to the window [year-window+1, year]
      ad1 <- list(sdata=ad$sdata[anno>=(year-window+1) & anno <= year],
                  jdata=ad$jdata[anno>=(year-window+1) & anno <= year])
      # Residualize data
      ad1 <- Do.residualize(ad1, residualize)
      stat <- list()
      flog.warn("[compare across time] Year: %i", year)
      stat$years <- year
      # A) Summary stats: leave-in vs leave-out sets. Bug fix: forward the
      # `reporting`/`opts_min` arguments instead of re-reading opts$... so
      # caller-supplied values are honored (defaults are unchanged).
      flog.info("Summary leave-in and leave-out set")
      Do.info.DE.sets(reporting)
      res_sets <- Do.compare.sets(ad1, reporting=reporting, opts_min=opts_min)
      flog.info("Export Leave-out comparison")
      file_id <- paste0(res_path,"res_sets_full_", opts_country,"-",window,"-",year ,".csv")
      Do.info.DE.sets.table(reporting)
      Do.export.table(data=res_sets$full, filename=file_id, type=opts$export_type, print_row_name=TRUE)
      file_id <- paste0(res_path,"res_sets_connect_", opts_country,"-",window,"-",year ,".csv")
      Do.info.DE.sets.table(reporting)
      Do.export.table(data=res_sets$connect, filename=file_id, type=opts$export_type, print_row_name=TRUE)
      file_id <- paste0(res_path,"res_sets_leave_out_", opts_country,"-",window,"-",year ,".csv")
      Do.info.DE.sets.table(reporting)
      Do.export.table(data=res_sets$leave_out, filename=file_id, type=opts$export_type, print_row_name=TRUE)
      # #B) AKM estimation (kept disabled, as in the original)
      # long_data<-Do.long_data(ad1)
      # setDT(long_data)
      # # find the connected set of firms, keep only workers in that subset
      # connected_set_of_firms <- akm.estimate.connected_set(long_data,ncat=100)
      # long_data <- long_data[firm_ID %in% connected_set_of_firms$largest]
      # # estimate AKM firm effects
      # firm_effects <- akm.estimate.zig_zag(long_data, outcome_var = "y1")
      # # computes AKM worker effects, given by average deviation from firm effect
      # long_data[, akm_worker_effect := mean(y1 - akm_firm_effect), by = "worker_ID"]
      # long_data[, akm_idiosyncratic := y1 - akm_worker_effect - akm_firm_effect]
      # # extract statistics
      # akm_statistics <- akm.estimate.extract_statistics(long_data, outcome_var = "y1")
      # file_id = paste0(res_path,"res_akm_", opts_country,"-",window,"-",year ,".csv")
      # Do.export.table(data=akm_statistics, filename=file_id, type=opts$export_type)
      # C) Trace estimation
      Do.info.DE.trace(reporting)
      # C i) Trace with fixed effects, heteroscedastic and homoscedastic
      res_het  <- m2.trace.estimate(ad1, hetero=TRUE)
      res_homo <- m2.trace.estimate(ad1, hetero=FALSE)
      res_trace <- rbind(res_het$stats, res_homo$stats)
      flog.info("#Set NaN for results computed with less than min # of obs")
      res_trace <- Do.NaN.trace(res_trace, opts_min=opts_min)
      file_id <- paste0(res_path,"res_trace_", opts_country,"-",window,"-",year ,".csv")
      Do.info.DE.trace.table(reporting)
      Do.export.table(data=res_trace, filename=file_id, type=opts$export_type)
      # C ii) Trace with BLM over the three firm sets (full sample)
      res_blm0 <- m2.trace.blm(ad1, set=0, subsample=0)
      res_blm1 <- m2.trace.blm(ad1, set=1, subsample=0)
      res_blm2 <- m2.trace.blm(ad1, set=2, subsample=0)
      res_trace_blm0 <- rbind(res_blm0$stats, res_blm1$stats, res_blm2$stats)
      flog.info("#Set NaN for results computed with less than min # of obs")
      res_trace_blm0 <- Do.NaN.trace(res_trace_blm0, opts_min=opts_min)
      file_id <- paste0(res_path,"res_trblm0_", opts_country,"-",window,"-",year ,".csv")
      Do.info.DE.trace.table(reporting)
      Do.export.table(data=res_trace_blm0, filename=file_id, type=opts$export_type)
      # C iii) Trace with BLM over the three firm sets (subsample variant)
      res_blm0 <- m2.trace.blm(ad1, set=0, subsample=2)
      res_blm1 <- m2.trace.blm(ad1, set=1, subsample=2)
      res_blm2 <- m2.trace.blm(ad1, set=2, subsample=2)
      res_trace_blm2 <- rbind(res_blm0$stats, res_blm1$stats, res_blm2$stats)
      flog.info("#Set NaN for results computed with less than min # of obs")
      res_trace_blm2 <- Do.NaN.trace(res_trace_blm2, opts_min=opts_min)
      file_id <- paste0(res_path,"res_trblm2_", opts_country,"-",window,"-",year ,".csv")
      Do.info.DE.trace.table(reporting)
      Do.export.table(data=res_trace_blm2, filename=file_id, type=opts$export_type)
      # E) Cluster firms and estimate the linear mini-model
      flog.warn("Here we estimate firm types for future use as in BLM (2018)")
      ms <- grouping.getMeasures(ad1, "ecdf", Nw=20, y_var = "y1")
      # Group firms into k=10 classes
      grps <- grouping.classify.once(ms, k = 10, nstart = 1000, iter.max = 200, step=250)
      # Append the cluster assignments to the data
      ad1 <- grouping.append(ad1, grps$best_cluster, drop=TRUE)
      Do.info.DE.estimate.window(reporting)
      # Estimate LL. The invisible(capture.output(...)) wrappers around plain
      # assignments in the original were no-ops and have been removed.
      res_ll <- m2.mini.estimate(ad1$jdata, ad1$sdata, norm=1, model0=c(), method="linear", withx=FALSE)
      stat$rsq    <- res_ll$vdec$rsq1
      stat$var_l  <- res_ll$vdec$stats$var_l
      stat$var_k  <- res_ll$vdec$stats$var_k
      stat$cor_kl <- res_ll$vdec$stats$cor_kl
      stat$cov_kl <- res_ll$vdec$stats$cov_kl
      # Sample sizes and mover share
      stat$N_s   <- nrow(ad1$sdata)
      stat$N_j   <- nrow(ad1$jdata)
      stat$share <- stat$N_j/(stat$N_j+stat$N_s)
      # Combine into a one-row table and export
      setDT(stat)
      file_id <- paste0(res_path,"res_ll_", opts_country,"-",window,"-",year ,".csv")
      flog.warn("Combine Table with var decomp")
      Do.info.DE.estimate.decomp(reporting)
      Do.export.table(data=stat, filename=file_id, type=opts$export_type, print_row_name=TRUE)
    }
  }
  invisible(NULL)
}
# NOTE(review): removed stray web-scrape footer text ("Add the following code
# to your website ... Embedding Snippets") that was not R code and broke parsing.