test_plmtest_unbalanced.R
In plm: Linear Models for Panel Data

# Test of new plmtest implementation (handling unbalanced panels)
#
# compare to grunfeld data example in Baltagi (2013), Econometric Analysis of Panel Data, 5th ed., p. 74-75 (Table 4.1/4.2)
#                                also Baltagi (2005), Econometric Analysis of Panel Data, 3rd ed., p. 65-66 (just Table 4.1,
#                                                                      table 4.2 in Baltagi (2005) is only Stata's xttest0
#                                                                      for Breusch-Pagan with chi2(1) = 798.16, Prob > chi2 = 0.0000)
#
# => statistics and p-values match => implementation of balanced tests is ok.
#
# The text book only supplies examples for the balanced Grunfeld data
# Thus, there are no reference values for an _un_balanced data set.
# -> compare calculation of EViews on an unbalanced data set (grunfeld without last observation)

# unbalanced formulas reduce in the case of a balanced panel to the formula for balanced panels:
#
#    balanced panel: => test output as in the text book => implementation is ok.
#  unbalanced panel: => test statistics for unbalanced panel differ from balanced panel
#                    => test matches calculation of EViews

# Tables from Baltagi
#
# Table 4.1
############ [statistic (critical values at 5% level)]
## note: SLM statistic is not implemented in plm
#     ind       time    twoways
#---------------------------------
# [...]

##### Grunfeld data set - balanced ####
# Table 4.2 [Output from EViews], similiar to above table but with p-values
##### EViews add-in BPTest for some older version of EViews needed:
##### http://www.eviews.com/Addins/addins.shtml#addins
##### http://forums.eviews.com/viewtopic.php?f=23&t=2228
##### In (at least) EViews 9, the LM tests are implemented, no need for add-in;
#####                         SLM is not outputted anymore but std. Honda and std. KW
#####                         and p-values for negative one-sided statistics are not
#####                         printed anymore (see unbalanced example below).
########### [statistic (p-values)]
#     ind       time    twoways
#---------------------------------
## note: SLM statistic is not implemented in plm
# [...]

############ unbalanced ##########################################
##### Grunfeld unbalanced data set [see below]
##### (last observation deleted, i. e. first 199 obs)
# Own computation with EViews 9
##### In EViews 9, the LM tests are implemented, no need for add-in anymore;
#####              SLM is not outputted but std. Honda and std. KW
#####              p-values for the negative one-sided statistics
#####              [in this example (std.) Honda, KW]
#####              are not printed in EViews 9; from the help file:

########### [statistic (p-values)]
#           ind       time    twoways
#---------------------------------
# [...]
#
## note: standardizised HO statistic is not implemented in plm
## note: standardizised KW statistic is not implemented in plm


options(digits = 10)
Sys.setenv(LANG = "en")
require(plm)
data("Grunfeld", package = "plm")
Grunfeldpdata <- pdata.frame(Grunfeld, index = c("firm", "year"), drop.index = FALSE, row.names = TRUE)

fe_grunfeld    <- plm(inv ~ value + capital, data=Grunfeldpdata, model="within")
re_grunfeld    <- plm(inv ~ value + capital, data=Grunfeldpdata, model="random")
pool_grunfeld  <- plm(inv ~ value + capital, data=Grunfeldpdata, model="pooling")



# Make an unbalanced data set
Grunfeldpdata_unbalanced <- Grunfeld[1:(nrow(Grunfeld)-1), ] 
Grunfeldpdata_unbalanced <- pdata.frame(Grunfeldpdata_unbalanced, index=c("firm"), drop.index = F)
fe_grunfeld_unbalanced   <- plm(inv ~ value + capital, data=Grunfeldpdata_unbalanced, model="within")
re_grunfeld_unbalanced   <- plm(inv ~ value + capital, data=Grunfeldpdata_unbalanced, model="random")
pool_grunfeld_unbalanced <- plm(inv ~ value + capital, data=Grunfeldpdata_unbalanced, model="pooling")


# Produc
# data("Produc", package = "plm")
# form_produc <- formula(gsp ~ log(pc) + log(pcap) + log(emp) + unemp)
# produc_pool <- plm(form_produc, data = Produc, model="pooling")

# Hedonic
# Stastics heavily differ for this unbalanced data, depending on one applies the
# balanced tests (v1.4-0) to this unbalanced data or the unbalanced test
#
# balanced test of v1.4-0:   849.45815 (individual effects) and 600.20821 (time effects)
# unbalanced test:            25.011274 (individual effects) and  1.5571417 (time effects)
data("Hedonic", package = "plm")
pHedonic <- pdata.frame(Hedonic, index = "townid", drop.index = F)
form_hedonic <- formula(mv ~ crim)
hedonic_pool <- plm(form_hedonic, data = pHedonic, model="pooling")
plmtest(hedonic_pool)
plmtest(hedonic_pool, effect = "time")


### generalized version of plmtest() to handle also unbalanced panels

# individual effect
print(honda_ind            <- plmtest(pool_grunfeld, type="honda"))
print(honda_ind_unbalanced <- plmtest(pool_grunfeld_unbalanced, type="honda"))
print(bp_ind               <- plmtest(pool_grunfeld, type="bp"))
print(bp_ind_unbalanced    <- plmtest(pool_grunfeld_unbalanced, type="bp"))
print(kw_ind               <- plmtest(pool_grunfeld, type="kw"))
print(kw_ind_unbalanced    <- plmtest(pool_grunfeld_unbalanced, type="kw"))
# Note: ghm is only for twoways, hence not in this section

# time effect
print(honda_time            <- plmtest(pool_grunfeld, type="honda", effect="time"))
print(honda_time_unbalanced <- plmtest(pool_grunfeld_unbalanced, type="honda", effect="time"))
print(bp_time               <- plmtest(pool_grunfeld, type="bp", effect="time"))
print(bp_time_unbalanced    <- plmtest(pool_grunfeld_unbalanced, type="bp", effect="time"))
print(kw_time               <- plmtest(pool_grunfeld, type="kw", effect="time"))
print(kw_time_unbalanced    <- plmtest(pool_grunfeld_unbalanced, type="kw", effect="time"))
# Note: ghm is only for twoways, hence not in this section

# twoways effect
print(honda_tw            <- plmtest(pool_grunfeld, type="honda", effect="twoways"))
print(honda_tw_unbalanced <- plmtest(pool_grunfeld_unbalanced, type="honda", effect="twoways"))
print(bp_tw               <- plmtest(pool_grunfeld, type="bp", effect="twoways"))
print(bp_tw_unbalanced    <- plmtest(pool_grunfeld_unbalanced, type="bp", effect="twoways"))
print(kw_tw               <- plmtest(pool_grunfeld, type="kw", effect="twoways"))
print(kw_tw_unbalanced    <- plmtest(pool_grunfeld_unbalanced, type="kw", effect="twoways"))
print(ghm_tw              <- plmtest(pool_grunfeld, type="ghm", effect="twoways"))
print(ghm_tw_unbalanced   <- plmtest(pool_grunfeld_unbalanced, type="ghm", effect="twoways"))



### Test of formula interface

# individual effect
print(honda_ind_form            <- plmtest(inv ~ value + capital, data=Grunfeldpdata, type="honda"))
print(honda_ind_unbalanced_form <- plmtest(inv ~ value + capital, data=Grunfeldpdata_unbalanced, type="honda"))
print(bp_ind_form               <- plmtest(inv ~ value + capital, data=Grunfeldpdata, type="bp"))
print(bp_ind_unbalanced_form    <- plmtest(inv ~ value + capital, data=Grunfeldpdata_unbalanced, type="bp"))
print(kw_ind_form               <- plmtest(inv ~ value + capital, data=Grunfeldpdata, type="kw"))
print(kw_ind_unbalanced_form    <- plmtest(inv ~ value + capital, data=Grunfeldpdata_unbalanced, type="kw"))
# Note: ghm is only for twoways, hence not in this section

# time effect
print(honda_time_form            <- plmtest(inv ~ value + capital, data=Grunfeldpdata, type="honda", effect="time"))
print(honda_time_unbalanced_form <- plmtest(inv ~ value + capital, data=Grunfeldpdata_unbalanced, type="honda", effect="time"))
print(bp_time_form               <- plmtest(inv ~ value + capital, data=Grunfeldpdata, type="bp", effect="time"))
print(bp_time_unbalanced_form    <- plmtest(inv ~ value + capital, data=Grunfeldpdata_unbalanced, type="bp", effect="time"))
print(kw_time_form               <- plmtest(inv ~ value + capital, data=Grunfeldpdata, type="kw", effect="time"))
print(kw_time_unbalanced_form    <- plmtest(inv ~ value + capital, data=Grunfeldpdata_unbalanced, type="kw", effect="time"))
# Note: ghm is only for twoways, hence not in this section

# twoways effect
print(honda_tw_form            <- plmtest(inv ~ value + capital, data=Grunfeldpdata, type="honda", effect="twoways"))
print(honda_tw_unbalanced_form <- plmtest(inv ~ value + capital, data=Grunfeldpdata_unbalanced, type="honda", effect="twoways"))
print(bp_tw_form               <- plmtest(inv ~ value + capital, data=Grunfeldpdata, type="bp", effect="twoways"))
print(bp_tw_unbalanced_form    <- plmtest(inv ~ value + capital, data=Grunfeldpdata_unbalanced, type="bp", effect="twoways"))
print(kw_tw_form               <- plmtest(inv ~ value + capital, data=Grunfeldpdata, type="kw", effect="twoways"))
print(kw_tw_unbalanced_form    <- plmtest(inv ~ value + capital, data=Grunfeldpdata_unbalanced, type="kw", effect="twoways"))
print(ghm_tw_form              <- plmtest(inv ~ value + capital, data=Grunfeldpdata, type="ghm", effect="twoways"))
print(ghm_tw_unbalanced_form   <- plmtest(inv ~ value + capital, data=Grunfeldpdata_unbalanced, type="ghm", effect="twoways"))

# Should all be TRUE
if(!all(
  identical(honda_ind,            honda_ind_form),
  identical(honda_ind_unbalanced, honda_ind_unbalanced_form),
  identical(bp_ind,               bp_ind_form),
  identical(bp_ind_unbalanced,    bp_ind_unbalanced_form),
  identical(kw_ind,               kw_ind_form),  
  identical(kw_ind_unbalanced,    kw_ind_unbalanced_form),
  
  identical(honda_time,            honda_time_form),   
  identical(honda_time_unbalanced, honda_time_unbalanced_form),
  identical(bp_time,               bp_time_form),
  identical(bp_time_unbalanced,    bp_time_unbalanced_form),
  identical(kw_time,               kw_time_form), 
  identical(kw_time_unbalanced,    kw_time_unbalanced_form),
  
  identical(honda_tw,            honda_tw_form),
  identical(honda_tw_unbalanced, honda_tw_unbalanced_form),
  identical(bp_tw,               bp_tw_form),
  identical(bp_tw_unbalanced,    bp_tw_unbalanced_form),
  identical(kw_tw,               kw_tw_form),   
  identical(kw_tw_unbalanced,    kw_tw_unbalanced_form),
  identical(ghm_tw,              ghm_tw_form),   
  identical(ghm_tw_unbalanced,   ghm_tw_unbalanced_form))) stop("results of plm and formula interface differ!")


# Tests - unbalanced - statistics should be "sufficiently different" from balanced statistics,
# thus results should be TRUE

  # individual
  abs(honda_ind_unbalanced$statistic - honda_ind$statistic) > 0.0001
  abs(bp_ind_unbalanced$statistic    - bp_ind$statistic)    > 0.0001
  abs(kw_ind_unbalanced$statistic    - kw_ind$statistic)    > 0.0001

  # time
  abs(honda_time_unbalanced$statistic - honda_time$statistic) > 0.0001
  abs(bp_time_unbalanced$statistic    - bp_time$statistic)    > 0.0001
  abs(kw_time_unbalanced$statistic    - kw_time$statistic)    > 0.0001
  
  # twoways
  abs(honda_tw_unbalanced$statistic - honda_tw$statistic) > 0.0001
  abs(bp_tw_unbalanced$statistic    - bp_tw$statistic)    > 0.0001
  abs(kw_tw_unbalanced$statistic    - kw_tw$statistic)    > 0.0001
  abs(ghm_tw_unbalanced$statistic   - ghm_tw$statistic)   > 0.0001






########## resemble critical values at alpha = 0.05 from Table 4.1 (Baltagi (2013), p. 74)
alpha <- 0.05

#### honda and kw oneway and twoway -> 1.645
qnorm(alpha, lower.tail = F)
# =>
pnorm(qnorm(alpha, lower.tail = F), lower.tail = F)

# honda (kw) p-value implementation as in plm_v1.4-0 (CRAN as of 2015-11-08):
# leads to the 10% level (not 5%):
# see also above the table for the unbalanced Grunfeld data on how EViews handles negative statistics for Honda and KW
pnorm(abs(1.645), lower.tail = FALSE)*2 # CRAN v1.4-0
# correct is -> p=0.05
pnorm(abs(1.645), lower.tail = FALSE)


#### bp: df=1 (oneway) -> 3.841
####     df=2 (twoway) -> 5.991
qchisq(alpha, df=1, lower.tail = F) # H0_a, H0_b
qchisq(alpha, df=2, lower.tail = F) # H0_c
# =>
pchisq(qchisq(alpha, df = 1, lower.tail = F), df=1, lower.tail = F)
pchisq(qchisq(alpha, df = 2, lower.tail = F), df=2, lower.tail = F)



#### ghm    test for p-value of mixed chi-square distribution (more often called chi-bar-square)
#           as implemented in fixed version.
#           (was simple chisquare in plm_v1.4-0 on CRAN -> wrong)
#
# Baltagi (2013), p. 88 (note 2), p. 209 (note 10) gives critical values for 0.01, 0.05, 0.10 levels
# 4.321 is a typo in the notes of Baltagi's textbook, should be 4.231 [confirmed by private email from Badi Baltagi]
crit <- c(7.289, 4.231, 2.952)   # without typo
# crit <- c(7.289, 4.312, 2.952) # with typo from text book
p.vals <- (1/4)*pchisq(crit, df=0, lower.tail = F) + (1/2) * pchisq(crit, df=1, lower.tail = F) + (1/4) * pchisq(crit, df=2, lower.tail = F)


# Baltagi (2013), p. 73, 74 contains another example of the mixed chi-square (chi-bar-square) distribution of another statistic
# The p-values for that example is also reassembled here
crit_2 <- c(2.706) # for alpha=0.05
p.val_2 <- (1/2)*pchisq(crit_2, df=0, lower.tail = F) + (1/2) * pchisq(crit_2, df=1, lower.tail = F)




################# Replicate an example from Stata
## example 1 in this manual:
## http://www.stata.com/manuals/xtxtregpostestimation.pdf
## It is an unbalanced panel

# require(haven) # required to read Stata data file
# nlswork <- read_dta("http://www.stata-press.com/data/r14/nlswork.dta")
# nlswork$race <- factor(nlswork$race) # fix data
# nlswork$race2 <- factor(ifelse(nlswork$race == 2, 1, 0)) # need this variable for example
# pnlswork <- pdata.frame(nlswork, index=c("idcode", "year"), drop.index=F)
# 
# # note STAT 14 uses by default a different method compared to plm's Swamy–Arora variance component estimator
# # This is why in comparison with web examples from Stata the random effects coefficients slightly differ
# plm_re_nlswork <- plm(ln_wage ~ grade + age + I(age^2) + ttl_exp + I(ttl_exp^2) + tenure + I(tenure^2) + race2 + not_smsa + south
#                         , data = pnlswork, model = "random")
# 
# # reassembles the FE estimation by Stata in Example 2 of http://www.stata.com/manuals13/xtxtreg.pdf
# plm_fe_nlswork <- plm(ln_wage ~ grade + age + I(age^2) + ttl_exp + I(ttl_exp^2) + tenure + I(tenure^2) + race2 + not_smsa + south
#                       , data = pnlswork, model = "within")
# 
# plm_pool_nlswork <- plm(ln_wage ~ grade + age + I(age^2) + ttl_exp + I(ttl_exp^2) + tenure + I(tenure^2) + race2 + not_smsa + south
#                       , data = pnlswork, model = "pooling")
# 
# 
# # Reassembles Exmaple 1 in http://www.stata.com/manuals14/xtxtregpostestimation.pdf
# # use modified plmtest() as a wrapper
# options(digits = 10)
# plmtest(plm_pool_nlswork, type="bp")
# 
# # ## Lagrange Multiplier Test - individual effects - Breusch-Pagan Test for unbalanced Panels as in Baltagi/Li (1990)
# # ## data:  ln_wage ~ grade + age + I(age^2) + ttl_exp + I(ttl_exp^2) + tenure +  ...
# # ## BP_unbalanced = 14779.984, df = 1, p-value < 2.2204e-16
# # ## alternative hypothesis: significant effects