# R/est_plm.R

starX <- function(formula, data, model, rhs = 1, effect){
  # non-exported, used for IV estimations "am" and "bms"
  # produces a column per time period with the (transformed) data
  # NB: function is not symmetric in individual and time effect
    apdim <- pdim(data)
    amatrix <- model.matrix(data, model, effect, rhs)
    T <- apdim$nT$T
    N <- apdim$nT$n
    if (apdim$balanced){
        result <- Reduce("cbind",
                        lapply(seq_len(ncol(amatrix)),
                               function(x)
                               matrix(amatrix[ , x], 
                                      ncol = T, byrow = TRUE)[rep(1:N, each = T), ]))
    }
    else{ # unbalanced
        Ti <- apdim$Tint$Ti
        result <- lapply(seq_len(ncol(amatrix)), function(x)
                     structure(amatrix[ , x], index = index(data),
                               class = c("pseries", class(amatrix[ , x]))))
        result <- Reduce("cbind", lapply(result, as.matrix))
        result <- result[rep(1:N, times = Ti), ]
        result[is.na(result)] <- 0
    }
    result
}
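
## Not part of plm: a minimal sketch of the balanced-case reshaping performed
## above, assuming a toy matrix with N = 2 individuals and T = 3 time periods.
## Each column of 'amatrix' is spread into T columns (one per period), and the
## resulting block is repeated for every observation of an individual.
# amatrix <- matrix(1:12, ncol = 2)  # 6 rows = 2 individuals x 3 periods
# N <- 2; T <- 3
# Reduce("cbind",
#        lapply(seq_len(ncol(amatrix)),
#               function(x)
#                 matrix(amatrix[ , x], ncol = T, byrow = TRUE)[rep(1:N, each = T), ]))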


# Regards plm man page: some elements not listed here...: "assign", "contrast",
# etc... \item{na.action}{if relevant, information about handling of
# NAs by the  model.frame function,}
# NB: na.action is currently not included as it is not supported


#' Panel Data Estimators
#' 
#' Linear models for panel data estimated using the `lm` function on
#' transformed data.
#' 
#' `plm` is a general function for the estimation of linear panel
#' models.  It supports the following estimation methods: pooled OLS
#' (`model = "pooling"`), fixed effects (`"within"`), random effects
#' (`"random"`), first--differences (`"fd"`), and between
#' (`"between"`). It supports unbalanced panels and two--way effects
#' (although not with all methods).
#' 
#' For random effects models, four estimators of the transformation
#' parameter are available by setting `random.method` to one of
#' `"swar"` \insertCite{SWAM:AROR:72}{plm} (default), `"amemiya"`
#' \insertCite{AMEM:71}{plm}, `"walhus"`
#' \insertCite{WALL:HUSS:69}{plm}, or `"nerlove"`
#' \insertCite{NERLO:71}{plm} (see below for the Hausman--Taylor
#' instrumental variable case).
#' 
#' The nested random effect model (\insertCite{BALT:SONG:JUNG:01}{plm}) 
#' is estimated by setting `model = "random"` and `effect = "nested"`, 
#' requiring the data to be indexed by a third index in which the "individual" 
#' dimension is nested (see section *Examples* and the vignette 
#' "Estimation of error components models with the plm function".)
#' 
#' For first--difference models, the intercept is maintained (which
#' from a specification viewpoint amounts to allowing for a trend in
#' the levels model). The user can exclude it from the estimated
#' specification in the usual way by adding `"-1"` to the model formula.
#' 
#' Instrumental variables estimation is obtained using two--part
#' formulas, the second part indicating the instrumental variables
#' used. This can be a complete list of instrumental variables or an
#' update of the first part. If, for example, the model is `y ~ x1 +
#' x2 + x3`, with `x1` and `x2` endogenous and `z1` and `z2` external
#' instruments, the model can be estimated with:
#' 
#' \itemize{
#' \item `formula = y~x1+x2+x3 | x3+z1+z2`,
#' \item `formula = y~x1+x2+x3 | . -x1-x2+z1+z2`.
#' }
#' 
#' If an instrumental variable estimation is requested, argument
#' `inst.method` selects the instrumental variable transformation
#' method:
#'
#' - `"bvk"` (default) for \insertCite{BALE:VARA:87;textual}{plm},
#' - `"baltagi"` for \insertCite{BALT:81;textual}{plm},
#' - `"am"` for \insertCite{AMEM:MACU:86;textual}{plm},
#' - `"bms"` for \insertCite{BREU:MIZO:SCHM:89;textual}{plm}.
#' 
#' The Hausman--Taylor estimator \insertCite{HAUS:TAYL:81}{plm} is
#' computed with arguments `random.method = "ht"`, `model = "random"`,
#' `inst.method = "baltagi"` (the older shortcut of specifying only
#' `model = "ht"` is deprecated).
#' 
#' See also the vignettes for introductions to model estimations (and more) with
#' examples.
#' 
#' @aliases plm
#' @param formula a symbolic description for the model to be
#'     estimated,
#' @param x,object an object of class `"plm"`,
#' @param data a `data.frame`,
#' @param subset see [stats::lm()],
#' @param weights see [stats::lm()],
#' @param na.action see [stats::lm()]; currently not fully
#'     supported,
#' @param effect the effects introduced in the model, one of
#'     `"individual"`, `"time"`, `"twoways"`, or
#'     `"nested"`,
#' @param model one of `"pooling"`, `"within"`,
#'     `"between"`, `"random"` `"fd"`, or `"ht"`,
#' @param random.method method of estimation for the variance
#'     components in the random effects model, one of `"swar"`
#'     (default), `"amemiya"`, `"walhus"`, `"nerlove"`; for
#'     Hausman-Taylor estimation set to `"ht"` (see Details and Examples), 
#' @param random.models an alternative to the previous argument: the
#'     models used to estimate the variance components are given
#'     explicitly,
#' @param random.dfcor a numeric vector of length 2 indicating which
#'     degrees of freedom should be used,
#' @param inst.method the instrumental variable transformation: one of
#'     `"bvk"`, `"baltagi"`, `"am"`, or `"bms"` (see also Details),
#' @param index the indexes,
#' @param restrict.matrix a matrix which defines linear restrictions
#'     on the coefficients,
#' @param restrict.rhs the right hand side vector of the linear
#'     restrictions on the coefficients,
#' @param digits number of digits for printed output,
#' @param width the maximum length of the lines in the printed output,
#' @param dx the half--length of the individual lines for the plot
#'     method (relative to x range),
#' @param N the number of individuals to plot,
#' @param seed the seed used to select the individuals to plot,
#' @param within if `TRUE`, the within model is plotted,
#' @param pooling if `TRUE`, the pooling model is plotted,
#' @param between if `TRUE`, the between model is plotted,
#' @param random if `TRUE`, the random effect model is plotted,
#' @param formula. a new formula for the update method,
#' @param evaluate a boolean for the update method, if `TRUE` the
#'     updated model is returned, if `FALSE` the call is returned,
#' @param \dots further arguments.
#' 
#' @return An object of class `"plm"`.
#'
#' 
#' A `"plm"` object has the following elements :
#'
#' \item{coefficients}{the vector of coefficients,}
#' \item{vcov}{the variance--covariance matrix of the coefficients,}
#' \item{residuals}{the vector of residuals (these are the residuals
#' of the (quasi-)demeaned model),}
#' \item{weights}{(only for weighted estimations) weights as
#' specified,}
#' \item{df.residual}{degrees of freedom of the residuals,}
#' \item{formula}{an object of class `"Formula"` describing the model,}
#' \item{model}{the model frame as a `"pdata.frame"` containing the
#' variables used for estimation: the response is in first column followed by
#' the other variables, the individual and time indexes are in the 'index'
#' attribute of `model`,}
#' \item{ercomp}{an object of class `"ercomp"` providing the
#' estimation of the components of the errors (for random effects
#' models only),}
#' \item{aliased}{named logical vector indicating any aliased
#' coefficients which are silently dropped by `plm` due to
#' linearly dependent terms (see also [detect.lindep()]),}
#' \item{call}{the call.}
#' 
#' 
#' It has `print`, `summary` and `print.summary` methods. The
#' `summary` method creates an object of class `"summary.plm"` that
#' extends the object it is run on with information about (inter alia) the F
#' statistic and the (adjusted) R-squared of the model, the standard errors,
#' t--values, and p--values of the coefficients, and (if supplied) the
#' furnished vcov; see [summary.plm()] for further details.
#' @import Formula
#' @importFrom stats alias approx as.formula coef coefficients cor delete.response
#' @importFrom stats deviance df.residual dnorm fitted formula lm lm.fit model.frame
#' @importFrom stats model.matrix model.response model.weights na.omit pchisq pf
#' @importFrom stats pnorm printCoefmat pt qnorm reshape resid residuals sd terms
#' @importFrom stats update var vcov
#' @importFrom grDevices heat.colors rainbow
#' @importFrom graphics abline axis barplot legend lines plot points
#' @export
#' @author Yves Croissant
#' @seealso [summary.plm()] for further details about the associated
#' summary method and the "summary.plm" object both of which provide some model
#' tests and tests of coefficients.  [fixef()] to compute the fixed
#' effects for "within" models (=fixed effects models). [predict.plm()] for 
#' predicted values.
#' @references
#'
#' \insertRef{AMEM:71}{plm}
#'
#' \insertRef{AMEM:MACU:86}{plm}
#'
#' \insertRef{BALE:VARA:87}{plm}
#'
#' \insertRef{BALT:81}{plm}
#'
#' \insertRef{BALT:SONG:JUNG:01}{plm}
#'
#' \insertRef{BALT:13}{plm}
#'
#' \insertRef{BREU:MIZO:SCHM:89}{plm}
#'
#' \insertRef{HAUS:TAYL:81}{plm}
#'
#' \insertRef{NERLO:71}{plm}
#'
#' \insertRef{SWAM:AROR:72}{plm}
#'
#' \insertRef{WALL:HUSS:69}{plm}
#' 
#' @keywords regression
#' @examples
#' 
#' data("Produc", package = "plm")
#' zz <- plm(log(gsp) ~ log(pcap) + log(pc) + log(emp) + unemp,
#'           data = Produc, index = c("state","year"))
#' summary(zz)
#' 
#' # replicates some results from Baltagi (2013), table 3.1
#' data("Grunfeld", package = "plm")
#' p <- plm(inv ~ value + capital,
#'          data = Grunfeld, model = "pooling")
#' 
#' wi <- plm(inv ~ value + capital,
#'           data = Grunfeld, model = "within", effect = "twoways")
#' 
#' swar <- plm(inv ~ value + capital,
#'             data = Grunfeld, model = "random", effect = "twoways")
#' 
#' amemiya <- plm(inv ~ value + capital,
#'                data = Grunfeld, model = "random", random.method = "amemiya",
#'                effect = "twoways")
#' 
#' walhus <- plm(inv ~ value + capital,
#'               data = Grunfeld, model = "random", random.method = "walhus",
#'               effect = "twoways")
#' 
#' # summary and summary with a furnished vcov (passed as matrix, 
#' # as function, and as function with additional argument)
#' summary(wi)
#' summary(wi, vcov = vcovHC(wi))
#' summary(wi, vcov = vcovHC)
#' summary(wi, vcov = function(x) vcovHC(x, method = "white2"))
#' 
#' 
#' ## nested random effect model
#' # replicate Baltagi/Song/Jung (2001), p. 378 (table 6), columns SA, WH
#' # == Baltagi (2013), pp. 204-205
#' data("Produc", package = "plm")
#' pProduc <- pdata.frame(Produc, index = c("state", "year", "region"))
#' form <- log(gsp) ~ log(pc) + log(emp) + log(hwy) + log(water) + log(util) + unemp
#' summary(plm(form, data = pProduc, model = "random", effect = "nested"))
#' summary(plm(form, data = pProduc, model = "random", effect = "nested",
#'             random.method = "walhus"))
#' 
#' ## Instrumental variable estimations
#' # replicate Baltagi (2013/2021), p. 133/162, table 7.1
#' data("Crime", package = "plm")
#' FE2SLS <- plm(lcrmrte ~ lprbarr + lpolpc + lprbconv + lprbpris + lavgsen +
#'                 ldensity + lwcon + lwtuc + lwtrd + lwfir + lwser + lwmfg + lwfed +
#'                 lwsta + lwloc + lpctymle + lpctmin + region + smsa + factor(year)
#'               | . - lprbarr - lpolpc + ltaxpc + lmix,
#'               data = Crime, model = "within")
#' G2SLS <- update(FE2SLS, model = "random", inst.method = "bvk")
#' EC2SLS <- update(G2SLS, model = "random", inst.method = "baltagi")
#' 
#' ## Hausman-Taylor estimator and Amemiya-MaCurdy estimator
#' # replicate Baltagi (2005, 2013), table 7.4; Baltagi (2021), table 7.5
#' data("Wages", package = "plm")
#' ht <- plm(lwage ~ wks + south + smsa + married + exp + I(exp ^ 2) + 
#'               bluecol + ind + union + sex + black + ed |
#'               bluecol + south + smsa + ind + sex + black |
#'               wks + married + union + exp + I(exp ^ 2), 
#'           data = Wages, index = 595,
#'           random.method = "ht", model = "random", inst.method = "baltagi")
#' summary(ht)
#' 
#' am <- plm(lwage ~ wks + south + smsa + married + exp + I(exp ^ 2) + 
#'               bluecol + ind + union + sex + black + ed |
#'               bluecol + south + smsa + ind + sex + black |
#'               wks + married + union + exp + I(exp ^ 2), 
#'           data = Wages, index = 595,
#'           random.method = "ht", model = "random", inst.method = "am")
#' summary(am)
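#' 
#' # Breusch--Mizon--Schmidt variant of the previous estimation
#' bms <- update(am, inst.method = "bms")
#' summary(bms)
#' 
#' # first-difference model; as described in Details, the intercept can be
#' # removed in the usual way
#' fd.noint <- plm(inv ~ value + capital - 1, data = Grunfeld, model = "fd")
#' summary(fd.noint)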
#'
plm <- function(formula, data, subset, weights, na.action,
                effect = c("individual", "time", "twoways", "nested"),
                model = c("within", "random", "ht", "between", "pooling", "fd"),
                random.method = NULL,
                random.models = NULL,
                random.dfcor = NULL,
                inst.method = c("bvk", "baltagi", "am", "bms"),
                restrict.matrix = NULL,
                restrict.rhs = NULL,
                index = NULL,
                ...){

    if (is.list(formula)){
        # if the first argument is a list (of formulas), then call plmlist and early exit
        plmlist <- match.call(expand.dots = FALSE)
        plmlist[[1L]] <- as.name("plm.list")
        # eval in nframe and not the usual parent.frame(), relevant?
        nframe <- length(sys.calls())
        plmlist <- eval(plmlist, sys.frame(which = nframe))
        return(plmlist)
    }

    if ((! is.null(restrict.matrix) || ! is.null(restrict.rhs)) && ! is.list(formula)) {
        stop(paste0("arguments 'restrict.matrix' and 'restrict.rhs' cannot yet be used ",
                    "for single equations"))
    }
    
    # match and check the effect and model arguments
    effect <- match.arg(effect)
    inst.method <- match.arg(inst.method)
    
    # note that model can be NA, in this case the model.frame is returned
    anyNA.model <- anyNA(model)
    if (! anyNA.model) model <- match.arg(model) 
    if (! anyNA.model && effect == "nested" && model != "random") {
      # input check for nested RE model
      stop(paste0("effect = \"nested\" only valid for model = \"random\", but input is model = \"",
                     model, "\"."))
      }
    
    if (! anyNA.model && model == "fd") {
      # input checks for FD model: give informative error messages as
      # described in footnote in vignette
        if(effect == "time") stop(paste("effect = \"time\" for first-difference model",
                                        "meaningless because cross-sections do not",
                                        "generally have a natural ordering"))
        if(effect == "twoways") stop(paste("effect = \"twoways\" is not defined",
                                           "for first-difference models"))
    }
    
    # Deprecated section
      
      # model = "ht" in plm() and pht() are no longer maintained, but working
      # -> call pht() and early exit
      if(! anyNA.model && model == "ht"){
          ht <- match.call(expand.dots = FALSE)
          m <- match(c("formula", "data", "subset", "na.action", "index"), names(ht), 0)
          ht <- ht[c(1L, m)]
          ht[[1L]] <- as.name("pht")
          ht <- eval(ht, parent.frame())
          return(ht)
      }
    
    orig_rownames <- row.names(data) # save original rownames for later restoring
    
    # check whether data and formula are pdata.frame and Formula and if not
    # coerce them
    
    # convert data to pdata.frame; if already pdata.frame input, do a sanity check
    if(!inherits(data, "pdata.frame")) {
        data <- pdata.frame(data, index = index, ...)
    } else is.pdata.frame(data, feedback = "warn")
        
    if(!inherits(formula, "Formula")) formula <- as.Formula(formula)

    # in case of 2-part formula, check whether the second part should
    # be updated, e.g., y ~ x1 + x2 + x3 | . - x2 + z becomes 
    # y ~ x1 + x2 + x3 | x1 + x3 + z
    # use length(formula)[2] because the length is now a vector of length 2
#    if (length(formula)[2] == 2) formula <- expand.formula(formula)
    
    # eval the model.frame
    cl <- match.call()
    mf <- match.call(expand.dots = FALSE)
    m <- match(c("data", "formula", "subset", "weights", "na.action"), names(mf), 0)
    mf <- mf[c(1L, m)]
    names(mf)[2:3] <- c("formula", "data")
    mf$drop.unused.levels <- TRUE
    mf[[1L]] <- as.name("model.frame")
    # use the Formula and pdata.frame which were created if necessary (and not
    # the original formula / data)
    mf$formula <- data
    mf$data <- formula
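    # (the swap is intentional: model.frame then dispatches to the pdata.frame
    #  method, with the Formula passed on through its 'data' argument)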
    data <- eval(mf, parent.frame())
    
    # preserve original row.names for data [also fancy rownames]; so functions
    # like pmodel.response(), model.frame(), model.matrix(), residuals() return
    # the original row.names eval(mf, parent.frame()) returns row.names as
    # character vector containing the "row_number" with incomplete observations
    # dropped
    # row.names(data) <- orig_rownames[as.numeric(row.names(data))]
    ## TODO make this cleaner. This is a dirty workaround to enable sandwich::vcovBS
    ##      on plm objects (vcovBS(fe_complete, cluster=~firm, R = 999))
    attr(data, "row.names") <- orig_rownames[as.numeric(row.names(data))]

    # if model = NA return the model.frame (via early exit), else continue to estimate model
    if (is.na(model)){
        attr(data, "formula") <- formula
        return(data)
    }

    # note that the model.frame has as attributes the Formula and the index
    # data.frame
    args <- list(model = model, effect = effect,
                 random.method = random.method,
                 random.models = random.models,
                 random.dfcor = random.dfcor,
                 inst.method = inst.method)
    result <- plm.fit(data, model, effect, random.method,
                      random.models, random.dfcor, inst.method)
    result$call <- cl
    result$args <- args
    result
}

plm.fit <- function(data, model, effect, random.method, 
                    random.models, random.dfcor, inst.method){
    formula <- attr(data, "formula")
    # check for 0 cases like in stats::lm.fit (e.g., due to NA dropping) 
    if (nrow(data) == 0L) stop("0 (non-NA) cases")

    # if a random effect model is estimated, compute the error components
    if (model == "random"){
      ## stopping control
      if (length(formula)[2L] > 1L && effect == "twoways")
        stop(paste("Instrumental variable random effect estimation",
                   "not implemented for two-ways panels"))
      
        is.balanced <- is.pbalanced(data)
        estec <- ercomp(data, effect, method = random.method,
                        models = random.models, dfcor = random.dfcor)
        sigma2 <- estec$sigma2
        theta <- estec$theta
    } else theta <- NULL

    # For all models except the unbalanced twoways random model, the
    # estimator is obtained as a linear regression on transformed data
    if (! (model == "random" && effect == "twoways" && ! is.balanced)){
        # extract the model.matrix and the model.response actually, this can be
        # done by providing model.matrix and pmodel.response's methods
        # to pdata.frames
        X <- model.matrix(data, rhs = 1, model = model, 
                          effect = effect, theta = theta, cstcovar.rm = "all")
        y <- pmodel.response(data, model = model, 
                             effect = effect, theta = theta)
        if (ncol(X) == 0L) stop("empty model")
        
        w <- model.weights(data)
        if (! is.null(w)){
            if (! is.numeric(w)) stop("'weights' must be a numeric vector")
            if (any(w < 0) || anyNA(w)) stop("missing or negative weights not allowed")
            X <- X * sqrt(w)
            y <- y * sqrt(w)
        }
        else w <- 1
        
        # IV case: extract the matrix of instruments if necessary
        # (means here that we have a multi-parts formula)
        if (length(formula)[2L] > 1L) {
          
            if(!is.null(model.weights(data)) || any(w != 1))
              stop("argument 'weights' not yet implemented for instrumental variable models")
            
          if ( ! (model == "random" && inst.method != "bvk")) {
           #  Pool/FD/FE/BE IV and RE "bvk" IV estimator
            if (length(formula)[2L] == 2L) {
                  W <- model.matrix(data, rhs = 2,
                                    model = model, effect = effect,
                                    theta = theta, cstcovar.rm = "all")
              }
              else {
                  W <- model.matrix(data, rhs = c(2, 3),
                                    model = model, effect = effect,
                                    theta = theta, cstcovar.rm = "all")
              }
          }
          
          if (model == "random" && inst.method != "bvk") {
           # IV estimators RE "baltagi", "am", and "bms"
            X <- X / sqrt(sigma2["idios"])
            y <- y / sqrt(sigma2["idios"])
            W1 <- model.matrix(data, rhs = 2,
                               model = "within", effect = effect,
                               theta = theta, cstcovar.rm = "all")
            B1 <- model.matrix(data, rhs = 2,
                               model = "Between", effect = effect, 
                               theta = theta, cstcovar.rm = "all")

            if (inst.method %in% c("am", "bms"))
              StarW1 <- starX(formula, data, rhs = 2, model = "within", effect = effect)
            
            if (length(formula)[2L] == 3L) {
              # eval. 3rd part of formula, if present
              W2 <- model.matrix(data, rhs = 3,
                                 model = "within", effect = effect, 
                                 theta = theta, cstcovar.rm = "all")
              
              if (inst.method == "bms")
                StarW2 <- starX(formula, data, rhs = 3, model = "within", effect = effect)
            } 
            else W2 <- StarW2 <- NULL
            
            # TODO: here, some weighting is done but prevented earlier by stop()?!
            #       also: RE bvk/BE/FE IV do not have weighting code.
            if (inst.method == "baltagi") W <- sqrt(w) * cbind(W1, W2, B1)
            if (inst.method == "am")      W <- sqrt(w) * cbind(W1, W2, B1, StarW1)
            if (inst.method == "bms")     W <- sqrt(w) * cbind(W1, W2, B1, StarW1, StarW2)
          }
      
          if (ncol(W) < ncol(X)) stop("insufficient number of instruments")
        } # END all IV cases
        else W <- NULL # no instruments (no IV case)
        
        result <- mylm(y, X, W)
        df <- df.residual(result)
        vcov <- result$vcov
        aliased <- result$aliased
        
        # in case of a within estimation, correct the degrees of freedom
        if (model == "within"){
            pdim <- pdim(data)
            card.fixef <- switch(effect,
                                 "individual" = pdim$nT$n,
                                 "time"       = pdim$nT$T,
                                 "twoways"    = pdim$nT$n + pdim$nT$T - 1
                                 )
            df <- df.residual(result) - card.fixef
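            # inflate vcov accordingly: the underlying lm fit on the demeaned
            # data does not account for the estimated fixed effects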
            vcov <- result$vcov * df.residual(result) / df
        }
        
        result <- list(coefficients = coef(result),
                       vcov         = vcov,
                       residuals    = resid(result),
                       weights      = w,
                       df.residual  = df,
                       formula      = formula,
                       model        = data)
        
        if (is.null(model.weights(data))) result$weights <- NULL
        if (model == "random") result$ercomp <- estec
    }
    else {
        # random twoways unbalanced:
        pdim <- pdim(data)
        TS <- pdim$nT$T
        theta <- estec$theta$id
        phi2mu <- estec$sigma2["time"] / estec$sigma2["idios"]
        Dmu <- model.matrix( ~ unclass(index(data))[[2L]] - 1)
        attr(Dmu, "index") <- index(data)
        Dmu <- Dmu - theta * Between(Dmu, "individual")
        X <- model.matrix(data, rhs = 1, model = "random", 
                          effect = "individual", theta = theta)
        y <- pmodel.response(data, model = "random", 
                             effect = "individual", theta = theta)
        P <- solve(diag(TS) + phi2mu * crossprod(Dmu))
        phi2mu.CPXDmu.P <- phi2mu * crossprod(X, Dmu) %*% P
        XPX <- crossprod(X)    - phi2mu.CPXDmu.P %*% crossprod(Dmu, X)
        XPy <- crossprod(X, y) - phi2mu.CPXDmu.P %*% crossprod(Dmu, y)
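        # solve the (transformed) normal equations XPX %*% gamma = XPy for the
        # GLS coefficient vector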
        gamma <- solve(XPX, XPy)[ , , drop = TRUE]

        # residuals 'e' are not the residuals of a quasi-demeaned
        # model but of the 'outer' model
        e <- pmodel.response(data, model = "pooling", effect = effect) -
            as.numeric(model.matrix(data, rhs = 1, model = "pooling") %*% gamma)
        
        result <- list(coefficients = gamma,
                       vcov         = solve(XPX),
                       formula      = formula,
                       model        = data,
                       ercomp       = estec,
                       df.residual  = nrow(X) - ncol(X),
                       residuals    = e)
        
        # derive 'aliased' information (this is based on the assumption that
        # estimation fails anyway if singularities are present).
        aliased <- is.na(gamma)
    }
    result$assign <- attr(X, "assign")
    result$contrasts <- attr(X, "contrasts")
    result$args <- list(model = model, effect = effect)
    result$aliased <- aliased
    class(result) <- c("plm", "panelmodel")
    result
}


tss <- function(x, ...){
  UseMethod("tss")
}

#' @rawNamespace S3method(tss, default)
tss.default <- function(x, ...){
  # always gives centered TSS (= demeaned TSS)
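  # e.g., for x <- 1:5 this returns 10, identical to sum((x - mean(x))^2)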
  var(x) * (length(x) - 1)
}

#' @rawNamespace S3method(tss, plm)
tss.plm <- function(x, model = NULL, ...){
    if(is.null(model)) model <- describe(x, "model")
    effect <- describe(x, "effect")
    if(model == "ht") model <- "pooling"
    theta <- if(model == "random") x$ercomp$theta else NULL
    tss(pmodel.response(x, model = model, effect = effect, theta = theta))
}

#' R squared and adjusted R squared for panel models
#' 
#' This function computes R squared or adjusted R squared for plm objects. It
#' allows the user to choose both the transformation of the data on which the
#' (adjusted) R squared is computed and the method of calculation.
#' 
#' @param object an object of class `"plm"`,
#' @param model the transformation of the data on which the R-squared is to be
#' computed. If `NULL`, the transformation used to estimate the model is
#' also used for the computation of R squared,
#' @param type indicates the method used to compute R squared. One of\cr
#' `"rss"` (residual sum of squares),\cr `"ess"` (explained sum of
#' squares), or\cr `"cor"` (coefficient of correlation between the fitted
#' values and the response),
#' @param dfcor if `TRUE`, the adjusted R squared is computed.
#' @return A numerical value. The R squared or adjusted R squared of the model
#' estimated on the transformed data, e.g., for the within model the so-called
#' "within R squared".
#' @seealso [plm()] for estimation of various models;
#' [summary.plm()] which makes use of `r.squared`.
#' @keywords htest
#' @export
#' @examples
#' 
#' data("Grunfeld", package = "plm")
#' p <- plm(inv ~ value + capital, data = Grunfeld, model = "pooling")
#' r.squared(p)
#' r.squared(p, dfcor = TRUE)
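#' 
#' # "within R-squared" of a fixed effects model on the same data
#' wi <- plm(inv ~ value + capital, data = Grunfeld, model = "within")
#' r.squared(wi)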
#' 
r.squared <- function(object, model = NULL,
                      type = c("cor", "rss", "ess"), dfcor = FALSE){
  ## TODO: does not handle non-intercept models correctly
  ##       see below r.squared_no_intercept
  if (is.null(model)) model <- describe(object, "model")
  effect <- describe(object, "effect")
  type <- match.arg(type)
  if (type == "cor"){
    y <- pmodel.response(object, model = model, effect = effect)
    haty <- fitted(object, model = model, effect = effect)
    R2 <- cor(y, haty)^2
  }
  if (type == "rss"){
    R2 <- 1 - deviance(object, model = model) / tss(object, model = model)
  }
  if (type == "ess"){
    haty <- fitted(object, model = model)
    mhaty <- mean(haty)
    ess <- as.numeric(crossprod((haty - mhaty)))
    R2 <- ess / tss(object, model = model)
  }
  ### adj. R2 Still wrong for models without intercept, e.g., pooling models
  # (but could be correct for within models, see comment below in function r.squared_no_intercept)
  if (dfcor) R2 <- 1 - (1 - R2) * (length(resid(object)) - 1) / df.residual(object)
  R2
}

## first try at r.squared adapted to be suitable for non-intercept models
r.squared_no_intercept <- function(object, model = NULL,
                      type = c("rss", "ess", "cor"), dfcor = FALSE){
    if(is.null(model)) model <- describe(object, "model")
    effect <- describe(object, "effect")
    type <- match.arg(type)
    ## TODO: check what is sane for IV and what for within
    # [1L] as has.intercept returns > 1 boolean for IV models # TODO: to check if this is sane
    has.int <- if(model != "within") has.intercept(object)[1L] else FALSE
    
    if (type == "rss"){
      # approach: 1 - RSS / TSS
      R2 <- if(has.int) {
        1 - deviance(object, model = model) / tss(object, model = model)  
      } else {
        # use non-centered (= non-demeaned) TSS
        1 - deviance(object, model = model) / as.numeric(crossprod(pmodel.response(object, model = model)))
      }
    }
        
    if(type == "ess"){
      # approach: ESS / TSS
      haty <- fitted(object, model = model)
      R2 <- if(has.int) {
        mhaty <- mean(haty)
        ess <- as.numeric(crossprod(haty - mhaty))
        tss <- tss(object, model = model)
        ess / tss
      }
      else {
        # use non-centered (=non-demeaned) ESS and non-centered TSS
        ess <- as.numeric(crossprod(haty))
        tss <- as.numeric(crossprod(pmodel.response(object, model = model)))
        ess / tss
      }
    }
    
    if(type == "cor"){
      # approach: squared-correlation(dependent variable, predicted value), only for models with intercept
      if(!has.int) warning("for models without intercept, type = \"cor\" may not be sane") # TODO: tbd if warning is good
      
      # TODO: Check should this be for "cor" the original variable? This makes a difference for (at least) RE models!
      #       and on the fitted values which are not given by fitted() for RE models
#      y <- pmodel.response(object, model = model, effect = effect)
#      haty <- fitted(object, model = model, effect = effect)
      y <- pmodel.response(object, model = "pooling")
      haty <- fitted_exp.plm(object)
      R2 <- cor(y, haty)^2
    }
    
    # this takes care of the intercept
    # Still unclear, how the adjustment for within models should look like,
    # i.e., subtract 1 for intercept or not
    if(dfcor) R2 <- 1 - (1 - R2) * (length(resid(object)) - has.int) / df.residual(object)
    
    return(R2)
}
