R/panelAR.R

Defines functions panelAR

Documented in panelAR

### Main panelAR function
### Author: Konstantin Kashin
### August 25, 2013

panelAR <- function(formula, data, panelVar, timeVar, autoCorr = c("ar1", 
    "none", "psar1"), panelCorrMethod = c("none","phet","pcse","pwls","parks"), rhotype ="breg", bound.rho = FALSE, rho.na.rm = FALSE, panel.weight = c("t-1", "t"), dof.correction = FALSE, complete.case = FALSE, seq.times = FALSE,  singular.ok=TRUE) {
    # save environment
    env.base <- environment()
    
    # save call
    call <- match.call()
    
    # check for presence of a formula and data object
    ind <- match(c("formula","data","timeVar"),names(call),nomatch=0)
    if(ind[1]==0){
    	stop("A formula argument is required.",call.=FALSE)
    }
    if(ind[2]==0){
    	stop("A data argument is required.",call.=FALSE)
    }
    if(ind[3]==0){
    	stop("You must specify time ID using timeVar.",call.=FALSE)
    }
    
    # identify autocorrelation and panel structure
    autoCorr.method <- match.arg(autoCorr)
    panelCorr.method <- match.arg(panelCorrMethod)
    
    # identify 2nd stage method
    pMethod <- switch(panelCorr.method, "none"="OLS", "phet"="OLS", "pcse"="OLS", "pwls"="GLS", "parks"="GLS")
        
    # check rhotype
    rhotype <- match.arg(rhotype,c("breg","freg","dw","theil-nagar","scorr","theil"))
    
    # check panel.weight & rho.na.rm arguments
    panel.weight <- match.arg(panel.weight) 
    if(!is.logical(rho.na.rm)){
    	stop("rho.na.rm should be logical argument.")
    }
    
    # make sure that timevar is in data object; check for NA values
   	if (!(timeVar %in% colnames(data))) {
        stop("Please make sure that timeVar is in the data object.",call.=FALSE)
    }
    # extract time vector and check for NA values
    time.vec <- data[, timeVar]
    if (any(is.na(time.vec))) {
       stop("You cannot have NA values for the time variable.",call.=FALSE)
    }
    if (!is.integer(time.vec) & !is.numeric(time.vec)) {
       stop("The time variable must be defined as an integer.",call.=FALSE)
    }

    # extract panel variable and check if is.null
    if (!is.null(panelVar)){
    	if (!(panelVar %in% colnames(data))) {
        stop("Please make sure that panelVar is in the data object.",call.=FALSE)
    	} else{
    	 panel.vec <- as.character(as.vector(data[, panelVar]))
    	 if (any(is.na(panel.vec))) {
    		stop("You cannot have NA values for the panel ID variable.",call.=FALSE)
    	 }
    	}
    } else{   
    	panel.vec <- rep("1",nrow(data))
    	if(panelCorr.method!="none") warning("Without specifying a panel ID variable, data is assumed to come from a single panel and variance is assumed to be homoskedastic within that panel. Panel heteroskedasticity and/or correlation is ignored.",call.=FALSE)
    }
    
    # check for duplicate times in each panel
    if (!is.null(timeVar)) {
        if (any(by(time.vec, panel.vec, function(x) any(table(x) > 1)))) {
           stop("You must specify unique times for each observation in a given panel.",call.=FALSE)
        }
    }
    
    # sort dataframe
    order.index <- order(panel.vec, time.vec)
    data <- data[order.index, ]
    panel.vec <- panel.vec[order.index]
    time.vec <- time.vec[order.index]
    
    # run OLS and extract results
    lm.out <- lm(formula = formula, data = data,singular.ok=singular.ok)
    
    # extract terms
    mterms <- lm.out$terms
    # aliased coefficients
    aliased <- is.na(coef(lm.out))
    # get model matrix, frame, and response from OLS regression
    X <- model.matrix(lm.out)[,!aliased]
    mf <- model.frame(lm.out)
    y <- model.response(mf)
    # create yX matrix
    yX <- cbind(y, X)
    # extract initial residuals from OLS regression
    original.e <- residuals(lm.out)
    # save variable names
    var.names <- colnames(X)
    # number of observations in regression
    N <- length(y)
    
	# get rank and residual dof (rdf)
	rank <- lm.out$rank
	rdf <- N-rank
	    
    # lm automatically row-deletes missing data 
    # so we need to do same for panelVar and timeVar
    # store na.action
    obs.dropped <- lm.out$na.action
    if (!is.null(obs.dropped)) {
        data <- data[-obs.dropped, ]
        panel.vec <- panel.vec[-obs.dropped]
        time.vec <- time.vec[-obs.dropped]
    }
    
    # generate time sequentially if seq.times
    if (seq.times) {
 		time.vec <- as.vector(unlist(by(data, panel.vec, function(x) 1:nrow(x))))
        data[, timeVar] <- time.vec # not sure if we need this
    }
    
    # sort units and times
    units <- sort(unique(panel.vec))
    times <- sort(unique(time.vec))
    # number of units and times
    N.units <- length(units)
    N.times <- length(times)
    # average number of units per panel
    N.avgperpanel <- N/N.units
    
    # stop if just 1 time period
    if(N.times<2){
    	stop("More than two time periods required.",call.=FALSE)
    }
    
    # check on Parks method
    if(panelCorr.method=="parks" & (N.units > N.times)){
    	stop("Cannot estimate Parks-Kmenta method because of singularity.")
    }
    
    # expected number of observations if panels balanced
    NT <- N.units * N.times
    # check if balanced
    balanced <- ifelse(N == NT, TRUE, FALSE)
    
    ### create observations matrix: rows are units and columns are times
    # dimensions: N_p x T
    # if cell is TRUE, panel i at time t is observed
    obs.mat <- reshape(cbind(data[, c(panelVar, timeVar)], TRUE), timevar = timeVar, 
    idvar = panelVar, direction = "wide", new.row.names = units)[,-1]
    col.order <- order(as.integer(gsub("TRUE.", "", colnames(obs.mat))))
    obs.mat <- obs.mat[, col.order]
    colnames(obs.mat) <- times
    obs.mat[is.na(obs.mat)] <- FALSE
    obs.mat <- as.matrix(obs.mat)
    
    ### runs analysis: generate number of runs by panel
    # output is vector of counts of number of runs per panel
    # length of vector is N_p
    N.runs.panel <- apply(obs.mat, MARGIN = 1, function(x) sum(rle(x)$values == TRUE))
    if (any(N.runs.panel > 1)) {
        message(paste("The following units have non-consecutive observations. Use runs.analysis() on output for additional details: ", paste(names(N.runs.panel[N.runs.panel>1]),collapse=", "),".",sep=""))
    }
    
    ### reshape residuals into T x N_p matrix
    # matrix will have NAs if unbalanced design
    e.mat <- matrix(NA, nrow = ncol(obs.mat), ncol = nrow(obs.mat))
    e.mat[t(obs.mat) == TRUE] <- original.e
   
   	### Run Prais-Winsten correction
    pw.output <- prais.correct(method = autoCorr.method,env.base=env.base)

    ### Extract residuals from Prais-Winsten regression
    transformed.resids <- pw.output$pw.lm$residuals
    ### Extract model matrix and response from Prais-Winsten regression
    model.mat.pw <- model.matrix(pw.output$pw.lm)
        
    ### In cases where rhos not bounded to [-1,1], we lose first observation
    ### so need to remake observation matrix
    # observation matrix is now T x N_p
    obs.mat.pw <- t(obs.mat)
    
    # create new observation matrix and update panel and time ID vectors 
    if (!is.null(pw.output$pw.lm$na.action)) {
        panel.vec.pw <- panel.vec[-pw.output$pw.lm$na.action]
        time.vec.pw <- time.vec[-pw.output$pw.lm$na.action]
        obs.mat.pw[obs.mat.pw == TRUE][pw.output$pw.lm$na.action] <- FALSE
    } else {
        panel.vec.pw <- panel.vec
        time.vec.pw <- time.vec
    }
    
    ### Panel homoskedasticity
    if (panelCorr.method == "none") {
    		# calculate residual variance
        sigma <- mean(transformed.resids^2)
        # N_p x N_p matrix of panel covariances
        Sigma <- diag(sigma, N.units)
        # matrix of residual covariances
        Omega <- diag(sigma, nrow(model.mat.pw))
        # number of panel covariances calculated
        N.cov <- 1
        # run estimation
        res <- switch(pMethod,OLS=ols(env.base), GLS=gls(env.base))
      ### Panel heteroskedasticity
      } else if (panelCorr.method == "phet"|panelCorr.method == "pwls") {
      	# N_p x N_p matrix of panel covariances
      	sigma.vec <- as.vector(by(transformed.resids, panel.vec.pw, function(x) mean(x^2)))
        if(length(sigma.vec)>1){
        	Sigma <- diag(sigma.vec)
        } else{
        	Sigma <- sigma.vec
        	}
        # matrix of residual covariances
        Omega <- diag(rep(sigma.vec, times = as.integer(table(panel.vec.pw))))
        # number of panel covariances calculated
        N.cov <- length(unique(panel.vec.pw))
        # run estimation
        res <- switch(pMethod,OLS=ols(env.base), GLS=gls(env.base))
    } else {
      
      ### Correlated panels / PCSE
        # if balanced panel design
        if (balanced) {
            # set up new T x N_p matrix for residuals
            E <- matrix(transformed.resids, nrow = N.times, ncol = N.units, byrow = FALSE)
            # E'E
            E.E <- crossprod(E)
            # N_p x N_p matrix which 
            # gives number of overlapping time observations between each panel
            weight.mat <- crossprod(obs.mat.pw)
        } else {
            
            # set up new T x N_p matrix for residuals
            E <- obs.mat.pw
            # set missing residuals to 0
            E[E == TRUE] <- transformed.resids
            E[E == FALSE] <- 0
            
            # complete case calculation of E'E and weight.mat
            if (complete.case) {
            		# average times per panel
            	    # select only those time periods with complete cases
                I.com.case <- apply(obs.mat.pw, MARGIN = 1, function(x) prod(x) == 1)
                if (!any(I.com.case)) {
                  stop("Unable to compute correlated SEs / PCSEs because there are no time periods in common across all units. Instead, consider setting complete.case=FALSE.",call.=FALSE)
                } else {
                  if(sum(I.com.case)<(0.5*N.avgperpanel)){
                  	warning(paste("The number of time periods used for the calculation of correlated SEs / PCSEs (",as.character(sum(I.com.case)),") is less than half the average number of time periods per panel (",as.character(round(N.avgperpanel,digits=2)),"). Consider setting complete.case=FALSE.",sep=""),call.=FALSE)
                  }
                  E[!I.com.case, ] <- 0
                  E.E <- crossprod(E)
                  weight.mat <- matrix(data = sum(I.com.case), nrow = nrow(E.E), ncol = nrow(E.E))
                }
            } else {
            	# pairwise option
                E.E <- crossprod(E)
                weight.mat <- crossprod(obs.mat.pw)
            }
        }
        
        # N_p x N_p matrix of panel covariances
        Sigma <- E.E/weight.mat
        # Number of panel covariances calculated
        N.cov <- length(Sigma[lower.tri(Sigma, diag = TRUE)])
        # assume covariance is 0 between panels that do not overlap in time periods
        Sigma <- replace(Sigma, is.na(Sigma), 0)
	
		# matrix of residual covariances
        Omega <- kronecker(Sigma, diag(1, N.times))
        if (!balanced) {
            Omega <- Omega[as.vector(obs.mat.pw), as.vector(obs.mat.pw)]
        }
        
        # run estimation
        res <- switch(pMethod,OLS=ols(env.base), GLS=gls(env.base))
    }
	# end of correlated SE calculations    

	# clean up coef vector and var-covar matrix
    coef <- as.vector(res$coef)
    vcov <- res$vcov
    colnames(vcov) <- rownames(vcov) <- names(coef) <- var.names
    
    # calculate yhat
    yhat <- as.vector(X %*% coef)
    names(yhat) <- row.names(X)
    
    # calculate residuals as y-yhat
    resids <- y - yhat
    
    # dof correction to vcov
    if (dof.correction) {
        vcov <- vcov * (N/(N - rank))
    }
    
    # create panelStructure list
    panelStructure <- list(obs.mat = obs.mat, rho = pw.output$pw.rho, Sigma = Sigma, N.cov=N.cov)
    
    if(autoCorr.method=="psar1"){
    names(panelStructure$rho) <- rownames(obs.mat)
    	}
    
    # R2 (if ols)
    if(pMethod=="OLS"){
     transform.y.vec <- model.response(model.frame(pw.output$pw.lm))
   	 r2 <- 1 - sum(transformed.resids^2)/sum((transform.y.vec-mean(transform.y.vec))^2)
    } else{
    		r2 <- NULL
    	}
    
    # insert panelVar and timeVar into model frame for output
    mf[,panelVar] <- panel.vec
    mf[,timeVar] <- time.vec
    
    # create output list
    fit <- list(coefficients = coef, residuals = resids,  fitted.values = yhat, rank = rank,
    				df.residual = rdf,call = call,terms = mterms, model = mf,
    				aliased=aliased, na.action = obs.dropped, vcov=vcov, 
    				r2=r2, panelStructure = panelStructure)
    class(fit) <- "panelAR"
    return(fit)
}  # end of panelAR fxn

Try the panelAR package in your browser

Any scripts or data that you put into this service are public.

panelAR documentation built on May 1, 2019, 8:19 p.m.