R/fitstats.R

Defines functions proj_on_U mat_svd mat_sqrt cd_stat kp_stat .wald degrees_freedom_iid degrees_freedom r2 fitstat_validate wald fitstat fitstat_register print.fixest_fitstat

Documented in degrees_freedom degrees_freedom_iid fitstat fitstat_register print.fixest_fitstat r2 wald

#----------------------------------------------#
# Author: Laurent Berge
# Date creation: Mon Apr 19 17:23:31 2021
# ~: Various fit statistics
#----------------------------------------------#


####
#### stats -- User level ####
####



#' Print method for fit statistics of fixest estimations
#'
#' Displays a brief summary of selected fit statistics from the function [`fitstat`].
#'
#' @inherit fitstat examples
#'
#' @param x An object resulting from the [`fitstat`] function.
#' @param na.rm Logical, default is `FALSE`. If `TRUE`, the statistics that are missing 
#' are not displayed.
#' @param ... Not currently used.
#'
#'
print.fixest_fitstat = function(x, na.rm = FALSE, ...){

  dots = list(...)

  # below now replaced with sma
  # glue = function(x) gsub("( +)(,)", "\\2\\1", paste(x[nchar(x) > 0], collapse = ", "))
  # glue = function(x) paste(x[nchar(x) > 0], collapse = ", ")

  all_types = fitstat(give_types = TRUE)
  dict_type = all_types$R_alias

  if(na.rm){
    IS_NA = sapply(x, function(z) identical(z, NA))
    x = x[!IS_NA]
    if(length(x) == 0) return(invisible(NULL))
  }

  if(isTRUE(dots$group.solo)){
    # Later integration in print
    solo = lengths(x) == 1
    x_solo = unlist(x[solo])
    n_solo = length(x_solo)

    if(n_solo > 1){
      x_solo_names = names(x[solo])
      x_solo = unlist(x[solo])

      x = x[!solo]

      nr = ceiling(n_solo / 2)
      m = matrix(c(x_solo, NA)[1:(2*nr)], nr, 2, byrow = TRUE)
      m_alias = matrix(c(unlist(dict_type[x_solo_names]), "")[1:(2*nr)], nr, 2, byrow = TRUE)

      if(nr > 1 && anyNA(m[nr, ])){
        m[nr, ] = m[nr, 2:1]
        m_alias[nr, ] = m_alias[nr, 2:1]
      }

      cols_new = list()
      for(i in 1:2){
        col = m[,i]
        gt1 = abs(col) > 1 & !is.na(col)
        col_char = numberFormatNormal(col)
        col_char[gt1] = sfill(col_char[gt1], anchor = ".", na = "")
        col_char[!gt1] = sfill(col_char[!gt1], anchor = ".", na = "")
        col_char = sfill(col_char, right = TRUE)

        # The aliases
        col_char = paste0(sfill(m_alias[, i]), ": ", col_char)
        if(anyNA(col)){
          col_char[is.na(col)] = sfill("", nchar(col_char[1]))
        }

        cols_new[[i]] = col_char
      }
      mm = do.call(cbind, cols_new)

      cat(apply(mm, 1, paste, collapse = "   "), sep = "\n")
    }

    if(length(x) == 0) return(invisible(NULL))
  }

  res = c()

  # Formatting the stats and pvalues
  is_stat = sapply(x, function(z) "stat" %in% names(z))
  is_pval = sapply(x, function(z) "p" %in% names(z))

  if(any(is_stat)){
    stat_all = sapply(x[is_stat], function(z) z$stat)
    stat_all = paste0("stat = ", 
                      sfill(sfill(numberFormatNormal(stat_all), anchor = "."), right = TRUE))
    for(i in seq_along(stat_all)){
      j = which(is_stat)[i]
      x[[j]]$stat = stat_all[i]
    }
  }

  if(any(is_pval)){
    pval_all = sapply(x[is_pval], function(z) z$p)
    low = pval_all < 2.2e-16 & !is.na(pval_all)
    pval_all[low] = 2.2e-16
    pval_all = paste0("p ", ifelse(low, "< ", "= "), 
                      sfill(numberFormatNormal(pval_all), right = TRUE))

    for(i in seq_along(pval_all)){
      j = which(is_pval)[i]
      x[[j]]$p = pval_all[i]
    }
  }

  # formatting for multiple endo regs
  qui_right = rep(FALSE, length(x))

  same_as = function(x, y) isTRUE(all.equal(x, y, check.attributes = FALSE))

  for(i in seq_along(x)){
    type = names(x)[i]

    v = x[[i]]

    if(grepl("::", type, fixed = TRUE)){
      dict = getFixest_dict()
      rename_fun = function(x) paste0(dict_type[x[1]], ", ", 
                                      paste(sapply(x[-1], dict_apply, dict = dict), 
                                            collapse = "::"))
      test_name = rename_fun(strsplit(type, "::")[[1]])
      qui_right[i] = TRUE
    } else {
      if(type %in% names(dict_type)){
        test_name = dict_type[type]
      } else if(!grepl(".", type, fixed = TRUE)) {
        warning("Current type '", type, "' has no alias.")
        test_name = type
      } else {
        type_split = strsplit(type, ".", fixed = TRUE)[[1]]
        test_name = paste0(dict_type[type_split[1]], ", ", dict_type[type_split[2]])
      }

    }

    if(length(v) == 1){
      # Basic display
      res[i] = paste0(test_name, "! ", numberFormatNormal(v))

    } else if(type %in% all_types$user_types){
      # we can have extra values for user defined functions

      opts = getOption("fixest_fitstat_user")
      alias_subtypes = opts[[type]]$alias_subtypes

      arg_list = list()
      skip_df2 = FALSE
      for(k in seq_along(v)){
        subtype = names(v)[k]

        if(subtype == "df2" && skip_df2) next

        if(subtype == "stat"){
          arg_list[[k]] = v$stat

        } else if(subtype == "p"){
          arg_list[[k]] = v$p

        } else if(subtype == "df" && same_as(alias_subtypes["df"], "df")){
          arg_list[[k]] = paste0("on ", numberFormatNormal(v$df), " DoF")

        } else if(subtype == "df1" && "df2" %in% names(v) && same_as(alias_subtypes["df1"], "df1")){
          arg_list[[k]] = paste0("on ", numberFormatNormal(v$df1), " and ", 
                                 numberFormatNormal(v$df2), " DoF")
          skip_df2 = TRUE

        } else if(subtype == "vcov"){
          arg_list[[k]] = paste0(alias_subtypes[subtype], ": ", v$vcov)

        } else {
          arg_list[[k]] = paste0(alias_subtypes[subtype], " = ", numberFormatNormal(v[[subtype]]))

        }
      }

      # res[i] = paste0(test_name, "! ", glue(arg_list), ".")
      res[i] = sma("{test_name}! {rm, ', 'c ? arg_list}.")

    } else {

      stat_line = p_line = dof_line = vcov_line = ""
      if(!is.null(v$stat)) stat_line = v$stat
      if(!is.null(v$p)) p_line = v$p
      if(!is.null(v$df)) dof_line = paste0("on ", numberFormatNormal(v$df), " DoF")
      if(!is.null(v$df1)) dof_line = paste0("on ", numberFormatNormal(v$df1), " and ", numberFormatNormal(v$df2), " DoF")
      if(!is.null(v$vcov)) vcov_line = paste0("VCOV: ", v$vcov)

      # res[i] = paste0(test_name, "! ", glue(c(stat_line, p_line, dof_line, vcov_line)), ".")
      res[i] = sma("{test_name}! {rm, ', 'c ? c(stat_line, p_line, dof_line, vcov_line)}.")
    }
  }

  if(any(qui_right)){
    res2fix = strsplit(res[qui_right], "!", fixed = TRUE)
    left = sapply(res2fix, function(x) x[1])
    right = sapply(res2fix, function(x) x[2])
    res_fixed = paste0(sfill(left, right = TRUE), "!", right)
    res[qui_right] = res_fixed
  }

  cat(gsub("!", ":", sfill(res, anchor = "!"), fixed = TRUE), sep = "\n")

}



#' Register custom fit statistics
#'
#' Enables the registration of custom fi statistics that can be easily summoned with the function [`fitstat`].
#'
#' @inherit fitstat seealso
#'
#' @param type A character scalar giving the type-name.
#' @param fun A function to be applied to a `fixest` estimation. It must return either a scalar, 
#' or a list of unitary elements. If the number of elements returned is greater than 1, 
#' then each element must be named! If the fit statistic is not valid for a given estimation, 
#' a plain `NA` value should be returned.
#' @param alias A (named) character vector. An alias to be used in lieu of the type name in 
#' the display methods (ie when used in [`print.fixest_fitstat`] or [`etable`]). 
#' If the function returns several values, i.e. sub-types, you can give an alias to 
#' these sub-types. The syntax is `c("type" = "alias", "subtype_i" = "alias_i")`, 
#' with "type" (resp. "subtype") the value of the argument `type` resp. (`subtypes`). 
#' You can also give an alias encompassing the type and sub-type with the syntax 
#' `c("type.subtype_i" = "alias")`.
#' @param subtypes A character vector giving the name of each element returned by the 
#' function `fun`. This is only used when the function returns more than one value. 
#' Note that you can use the shortcut "test" when the sub-types are "stat", "p" and "df"; 
#' and "test2" when these are "stat", "p", "df1" and "df2".
#'
#' @details
#' If there are several components to the computed statistics (i.e. the function returns 
#' several elements), then using the argument `subtypes`, giving the names of each of 
#' these components, is mandatory. This is to ensure that the statistic can be used as any 
#' other built-in statistic (and there are too many edge cases impeding automatic deduction).
#'
#' @author Laurent Berge
#'
#' @examples
#'
#' # An estimation
#' base = iris
#' names(base) = c("y", "x1", "x2", "x3", "species")
#' est = feols(y ~ x1 + x2 | species, base)
#'
#' #
#' # single valued tests
#' #
#'
#' # say you want to add the coefficient of variation of the dependent variable
#' cv = function(est){
#'   y = model.matrix(est, type = "lhs")
#'   sd(y)/mean(y)
#' }
#'
#' # Now we register the routine
#' fitstat_register("cvy", cv, "Coef. of Variation (dep. var.)")
#'
#' # now we can summon the registered routine with its type ("cvy")
#' fitstat(est, "cvy")
#'
#' #
#' # Multi valued tests
#' #
#'
#' # Let's say you want a Wald test with an heteroskedasticiy robust variance
#'
#' # First we create the function
#' hc_wald = function(est){
#'   w = wald(est, keep = "!Intercept", print = FALSE, se = "hetero")
#'   head(w, 4)
#' }
#' # This test returns a vector of 4 elements: stat, p, df1 and df2
#'
#' # Now we register the routine
#' fitstat_register("hc_wald", hc_wald, "Wald (HC1)", "test2")
#'
#' # You can access the statistic, as before
#' fitstat(est, "hc_wald")
#'
#' # But you can also access the sub elements
#' fitstat(est, "hc_wald.p")
#'
fitstat_register = function(type, fun, alias = NULL, subtypes = NULL){

  check_arg(type, "character scalar mbt")
  check_arg(fun, "function mbt")
  check_arg(alias, "NULL character vector no na")
  check_arg(subtypes, "NULL character vector no na")

  # We check the type is not conflicting
  existing_types = fitstat(give_types = TRUE)$types

  opts = getOption("fixest_fitstat_user")

  if(type %in% setdiff(existing_types, names(opts))){
    stopi("The type name {bq?type} is the same as one built-in type. Please choose another one.")
  }

  # Alias:
  # - type_name = alias_name          => only the type
  # - subtype_name = alias_name       => only the subtype
  # - type.subtype_name = alias_name  => full type name

  # if alias is a character vector WITHOUT name, then only the type is named

  if(identical(subtypes, "test")){
    subtypes = c("stat", "p", "df")
  } else if(identical(subtypes, "test2")){
    subtypes = c("stat", "p", "df1", "df2")
  }

  IS_SUB = !is.null(subtypes)

  if(missnull(alias)){
    alias = type
    names(alias) = type

  } else {
    # Handling all the cases is a bit of a pain

    if(is.null(names(alias))){

      if(length(alias) == 1){
        tmp = alias
        names(tmp) = type
        alias = tmp

      } else if(length(alias) == (1 + length(subtypes))){
        # Implicit naming
        tmp = alias
        names(tmp) = c(type, subtypes)
        alias = tmp

      } else {
        # We have a problem here, we can't infer implicitly, too arbitrary
        if(IS_SUB){
          check_value(alias, "character scalar", 
                      .message = "The alias should be a character scalar.")
        } else {
          stopi("To define the aliases, please use a named character vector, ",
                "with the names being the codes, and the values the aliases. \n",
                "E.g.: alias = c({Q?type} = \"alias_1\", {Q?subtypes[1]} = \"alias_2\").")
        }
      }

    } else {
      alias_names = names(alias)

      is_0 = which(nchar(alias_names) == 0)
      if(length(is_0) > 0){
        # checking the problems
        if(length(is_0) > 1){
          stop("In the argument 'alias': only the main type can be implicitly set and should come first (they are several implicitly defined atm). Please explicitly use names for each type/subtype you want.")

        } else if(is_0 != 1){
          stop("In the argument 'alias': only the main type can be implicitly set and should come first (it is currenly not first). Please explicitly use names for each type/subtype you want.")

        }

        names(alias)[1] = type
        alias_names = names(alias)
      }

      # We check that the names are correct
      possible_types = type
      if(IS_SUB){
        possible_types = c(possible_types, subtypes, paste0(type, ".", subtypes))
      }

      check_set_value(alias_names, "multi match", 
                      .choices = possible_types, 
                      .message = "In argument 'alias', the names should be equal to the type, the subtypes, or of the form 'type.subtype'.")
      names(alias) = alias_names
    }
  }

  # We create the full list of aliases
  alias_subtypes = NULL
  if(IS_SUB){

    # subtype aliases => I add some default values if not provided

    default_sub_aliases = setNames(subtypes, subtypes)
    default_sub_aliases[c("stat", "p")] = c("stat.", "p-value")

    # Note that the names are not kept when NA.. ||
    alias_subtypes = alias[subtypes] #           ||
    qui_NA = is.na(alias_subtypes)   #           ||
    alias_subtypes = alias_subtypes[!qui_NA] #   | = > I need to clean them

    alias_subtypes[subtypes[qui_NA]] = default_sub_aliases[subtypes[qui_NA]]

    # Main aliases

    if(type %in% names(alias)){
      alias_main = alias[type]
    } else  {
      alias_main = setNames(type, type)
    }

    # The full types
    full_types = paste0(type, ".", subtypes)
    alias_full = alias[full_types]
    qui_NA = is.na(alias_full)
    alias_full = alias_full[!qui_NA]

    alias_full[full_types[qui_NA]] = paste0(alias_main, ", ", alias_subtypes[subtypes[qui_NA]])

    alias_main = c(alias_main, alias_full)

  } else {
    alias_main = alias
  }

  res = list(fun = fun, alias = alias_main, alias_subtypes = alias_subtypes)

  opts[[type]] = res

  options(fixest_fitstat_user = opts)

  invisible(NULL)
}

#' Computes fit statistics of fixest objects
#'
#' Computes various fit statistics for `fixest` estimations.
#'
#' @param x A `fixest` estimation.
#' @param type Character vector or one sided formula. The type of fit statistic to be computed. 
#' The classic ones are: n, rmse, r2, pr2, f, wald, ivf, ivwald. You have the full list in 
#' the details section or use `show_types = TRUE`. Further, you can register your own types 
#' with [`fitstat_register`].
#' @param simplify Logical, default is `FALSE`. By default a list is returned whose names are 
#' the selected types. If `simplify = TRUE` and only one type is selected, then the element 
#' is directly returned (ie will not be nested in a list).
#' @param verbose Logical, default is `TRUE`. If `TRUE`, an object of class `fixest_fitstat` 
#' is returned (so its associated print method will be triggered). If `FALSE` a simple list 
#' is returned instead.
#' @param show_types Logical, default is `FALSE`. If `TRUE`, only prompts all available types.
#' @param frame An environment in which to evaluate variables, default is `parent.frame()`. 
#' Only used if the argument `type` is a formula and some values in the formula have to be 
#' extended with the dot square bracket operator. Mostly for internal use.
#' @param ... Other elements to be passed to other methods and may be used to compute 
#' the statistics (for example you can pass on arguments to compute the VCOV when using 
#' `type = "g"` or `type = "wald"`.).
#'
#' @section Registering your own types:
#'
#' You can register custom fit statistics with the function `fitstat_register`.
#'
#' @section Available types:
#'
#' The types are case sensitive, please use lower case only. The types available are:
#'
#' \describe{
#' \item{`n`, `ll`, `aic`, `bic`, `rmse`: }{The number of observations, the log-likelihood, 
#' the AIC, the BIC and the root mean squared error, respectively.}
#' \item{`my`: }{Mean of the dependent variable.}
#' \item{`g`: }{The degrees of freedom used to compute the t-test (it influences the p-values 
#' of the coefficients). When the VCOV is clustered, this value is equal to the minimum 
#' cluster size, otherwise, it is equal to the sample size minus the number of variables.}
#' \item{`r2`, `ar2`, `wr2`, `awr2`, `pr2`, `apr2`, `wpr2`, `awpr2`: }{All r2 that can be 
#' obtained with the function [`r2`]. The `a` stands for 'adjusted', the `w` for 'within' and 
#' the `p` for 'pseudo'. Note that the order of the letters `a`, `w` and `p` does not matter. 
#' The pseudo R2s are McFadden's R2s (ratios of log-likelihoods).}
#' \item{`theta`: }{The over-dispersion parameter in Negative Binomial models. Low values mean 
#' high overdispersion.}
#' \item{`f`, `wf`: }{The F-tests of nullity of the coefficients. The `w` stands for 
#' 'within'. These types return the following values: `stat`, `p`, `df1` and `df2`. 
#' If you want to display only one of these, use their name after a dot: e.g. `f.stat` 
#' will give the statistic of the F-test, or `wf.p` will give the p-values of the F-test 
#' on the projected model (i.e. projected onto the fixed-effects).}
#' \item{`wald`: }{Wald test of joint nullity of the coefficients. This test always excludes 
#' the intercept and the fixed-effects. These type returns the following values: 
#' `stat`, `p`, `df1`, `df2` and `vcov`. The element `vcov` reports the way the VCOV 
#' matrix was computed since it directly influences this statistic.}
#' \item{`ivf`, `ivf1`, `ivf2`, `ivfall`: }{These statistics are specific to IV estimations. 
#' They report either the IV F-test (namely the Cragg-Donald F statistic in the presence 
#' of only one endogenous regressor) of the first stage (`ivf` or `ivf1`), of the 
#' second stage (`ivf2`) or of both (`ivfall`). The F-test of the first stage is 
#' commonly named weak instrument test. The value of `ivfall` is only useful in [`etable`] 
#' when both the 1st and 2nd stages are displayed (it leads to the 1st stage F-test(s) 
#' to be displayed on the 1st stage estimation(s), and the 2nd stage one on the 
#' 2nd stage estimation -- otherwise, `ivf1` would also be displayed on the 2nd stage 
#' estimation). These types return the following values: `stat`, `p`, `df1` and `df2`.}
#' \item{`ivwald`, `ivwald1`, `ivwald2`, `ivwaldall`: }{These statistics are specific to IV 
#' estimations. They report either the IV Wald-test of the first stage (`ivwald` or `ivwald1`), 
#' of the second stage (`ivwald2`) or of both (`ivwaldall`). The Wald-test of the first stage 
#' is commonly named weak instrument test. Note that if the estimation was done with a robust 
#' VCOV and there is only one endogenous regressor, this is equivalent to the 
#' Kleibergen-Paap statistic. The value of `ivwaldall` is only useful in [`etable`] when both 
#' the 1st and 2nd stages are displayed (it leads to the 1st stage Wald-test(s) to be displayed 
#' on the 1st stage estimation(s), and the 2nd stage one on the 2nd stage estimation -- 
#' otherwise, `ivwald1` would also be displayed on the 2nd stage estimation). These types 
#' return the following values: `stat`, `p`, `df1`, `df2`, and `vcov`.}
#' \item{`cd`: }{The Cragg-Donald test for weak instruments.}
#' \item{`kpr`: }{The Kleibergen-Paap test for weak instruments.}
#' \item{`wh`: }{This statistic is specific to IV estimations. Wu-Hausman endogeneity test. 
#' H0 is the absence of endogeneity of the instrumented variables. It returns the following 
#' values: `stat`, `p`, `df1`, `df2`.}
#' \item{`sargan`: }{Sargan test of overidentifying restrictions. H0: the instruments are 
#' not correlated with the second stage residuals. It returns the 
#' following values: `stat`, `p`, `df`.}
#' \item{`lr`, `wlr`: }{Likelihood ratio and within likelihood ratio tests. It returns 
#' the following elements: `stat`, `p`, `df`. Concerning the within-LR test, note that, 
#' contrary to estimations with `femlm` or `feNmlm`, estimations with `feglm`/`fepois` 
#' need to estimate the model with fixed-effects only which may prove time-consuming 
#' (depending on your model). Bottom line, if you really need the within-LR and estimate a 
#' Poisson model, use `femlm` instead of `fepois` (the former uses direct ML maximization for 
#' which the only FEs model is a by product).}
#' }
#'
#'
#'
#'
#' @return
#' By default an object of class `fixest_fitstat` is returned. Using `verbose = FALSE` 
#' returns a simple a list. Finally, if only one type is selected, `simplify = TRUE` 
#' leads to the selected type to be returned.
#'
#' @examples
#'
#' data(trade)
#' gravity = feols(log(Euros) ~ log(dist_km) | Destination + Origin, trade)
#'
#' # Extracting the 'working' number of observations used to compute the pvalues
#' fitstat(gravity, "g", simplify = TRUE)
#'
#' # Some fit statistics
#' fitstat(gravity, ~ rmse + r2 + wald + wf)
#'
#' # You can use them in etable
#' etable(gravity, fitstat = ~ rmse + r2 + wald + wf)
#'
#' # For wald and wf, you could show the pvalue instead:
#' etable(gravity, fitstat = ~ rmse + r2 + wald.p + wf.p)
#'
#' # Now let's display some statistics that are not built-in
#' # => we use fitstat_register to create them
#'
#' # We need: a) type name, b) the function to be applied
#' #          c) (optional) an alias
#'
#' fitstat_register("tstand", function(x) tstat(x, se = "stand")[1], "t-stat (regular)")
#' fitstat_register("thc", function(x) tstat(x, se = "heter")[1], "t-stat (HC1)")
#' fitstat_register("t1w", function(x) tstat(x, se = "clus")[1], "t-stat (clustered)")
#' fitstat_register("t2w", function(x) tstat(x, se = "twow")[1], "t-stat (2-way)")
#'
#' # Now we can use these keywords in fitstat:
#' etable(gravity, fitstat = ~ . + tstand + thc + t1w + t2w)
#'
#' # Note that the custom stats we created are can easily lead
#' # to errors, but that's another story!
#'
#'
fitstat = function(x, type, simplify = FALSE, verbose = TRUE, show_types = FALSE,
           frame = parent.frame(), ...){

  r2_types = c("sq.cor", "cor2", "r2", "ar2", "pr2", "apr2", "par2", "wr2",
               "war2", "awr2", "wpr2", "pwr2", "wapr2", "wpar2", "awpr2",
               "apwr2", "pawr2", "pwar2")

  opts = getOption("fixest_fitstat_user")
  user_types = names(opts)

  dots = list(...)

  if(isTRUE(dots$give_types) || isTRUE(show_types)){

    # Compound types => types yielding several values

    # F-test etc
    comp_types = c("f", "wf", "ivf", "ivf1", "ivf2", "ivfall", "wald", "ivwald",
                   "ivwald1", "ivwald2", "ivwaldall", "wh", "sargan", "lr",
                   "wlr", "kpr", "cd")

    full_comp_types = paste(comp_types, rep(c("stat", "p"), each = length(comp_types)), sep = ".")

    comp_alias = c(f = "F-test", wf = "F-test (projected)", ivfall = "F-test (IV only)", 
                   ivf1 = "F-test (1st stage)", ivf2 = "F-test (2nd stage)", 
                   wald = "Wald (joint nullity)", ivwaldall = "Wald (IV only)", 
                   ivwald1 = "Wald (1st stage)", ivwald2 = "Wald (2nd stage)", 
                   wh = "Wu-Hausman", sargan = "Sargan", lr = "LR", wlr = "LR (within)", 
                   df1 = "DoF (first)", df2 = "DoF (second)", df = "DoF", 
                   kpr = "Kleibergen-Paap", cd = "Cragg-Donald")
    
    my_names = paste(names(comp_alias), rep(c("stat", "p"), each = length(comp_alias)), sep = ".")
    full_comp_alias = setNames(paste0(comp_alias, ", ", 
                                      rep(c("stat.", "p-value"), 
                                          each = length(comp_alias))), my_names)


    # Having some trouble with aggregation post lapply due to names....
    # I have a vectorized solution but it's really ugly => loop is clearer
    user_alias = c()
    for(i in seq_along(opts)){
      user_alias = c(user_alias, opts[[i]]$alias)
    }

    user_types = names(user_alias)

    # "Regular" types
    valid_types = c("n", "ll", "aic", "bic", "rmse", "g", "my", "theta", r2_types, 
                    comp_types, full_comp_types, user_types)

    tex_alias = c(n = "Observations", ll = "Log-Likelihood", aic = "AIC", bic = "BIC", 
                  my = "Dependent variable mean", g = "Size of the 'effective' sample", 
                  rmse = "RMSE", theta = "Over-dispersion", sq.cor = "Squared Correlation", 
                  cor2 = "Squared Correlation", r2 = "R$^2$", ar2 = "Adjusted R$^2$", 
                  pr2 = "Pseudo R$^2$", apr2 = "Adjusted Pseudo R$^2$", 
                  wr2 = "Within R$^2$", war2 = "Within Adjusted R$^2$", 
                  wpr2 = "Within Pseudo R$^2$", wapr2 = "Whithin Adjusted Pseudo R$^2$", 
                  comp_alias, full_comp_alias, user_alias)

    R_alias = c(n = "Observations", ll = "Log-Likelihood", aic = "AIC", bic = "BIC", 
                my = "Dep. Var. mean", g = "G", rmse = "RMSE", theta = "Over-dispersion", 
                sq.cor = "Squared Cor.", cor2 = "Squared Cor.", r2 = "R2", ar2 = "Adj. R2", 
                pr2 = "Pseudo R2", apr2 = "Adj. Pseudo R2", wr2 = "Within R2", 
                war2 = "Within Adj. R2", wpr2 = "Within Pseudo R2", 
                wapr2 = "Whithin Adj. Pseudo R2", comp_alias, full_comp_alias, user_alias)

    # add r2 type alias
    type_alias = c(ivf = "ivf1", ivf.stat = "ivf1.stat", ivf.p = "ivf1.p", ivwald = "ivwald1", 
                   ivwald.stat = "ivwald1", ivwald.p = "ivwald1.p", par2 = "apr2", 
                   awr2 = "war2", pwr2 = "wpr2", wpar2 = "wapr2", pwar2 = "wapr2", 
                   pawr2 = "wapr2", apwr2 = "wapr2", awpr2 = "wapr2", sq.cor = "cor2")

    if(show_types){
      cat("Available types:\n", paste(valid_types, collapse = ", "), "\n")
      return(invisible(NULL))
    }

    res = list(types = valid_types, tex_alias = tex_alias, R_alias = R_alias, 
               type_alias = type_alias, user_types = user_types)
    return(res)
  }

  check_arg(x, "class(fixest) mbt")
  check_set_arg(type, "character vector no na | os formula")
  check_arg(simplify, verbose, "logical scalar")

  if("formula" %in% class(type)){
    type = .xpd(type, frame = frame)

    type = gsub(" ", "", strsplit(deparse_long(type[[2]]), "+", fixed = TRUE)[[1]])
  }
  type = tolower(type)

  # To update
  if(!isTRUE(x$summary) || any(c("se", "cluster", "ssc") %in% names(dots))){
    x = summary(x, ...)
  }

  IS_ETABLE = isTRUE(dots$etable)
  set_value = function(vec, value){
    if(length(vec) == 1) return(vec)

    if(value != ""){

      if(!value %in% names(vec)){
        stop_up("'", value, "' is not a valid component of the statistic '", root, "'. Only the following are valid: ", enumerate_items(names(vec), "quote"), ".")
      }

      return(vec[[value]])
    } else if(IS_ETABLE){
      return(vec[1])
    } else {
      return(vec)
    }
  }

  res_all = list()
  type_all = type

  for(i in seq_along(type_all)){
    type = type_all[i]

    # Big if
    if(type == "n"){
      res_all[[type]] = x$nobs

    } else if(type == "ll"){
      res_all[[type]] = logLik(x)

    } else if(type == "aic"){
      res_all[[type]] = AIC(x)

    } else if(type == "bic"){
      res_all[[type]] = BIC(x)

    } else if(type == "rmse"){
      res_all[[type]] = if(!is.null(x$ssr)) sqrt(x$ssr / x$nobs) else sqrt(cpp_ssq(resid(x)) / x$nobs)

    } else if(type == "g"){
      my_vcov = x$cov.scaled

      G = attr(my_vcov, "G")
      if(is.null(G)) G = x$nobs - x$nparams

      res_all[[type]] = G

    } else if(type == "my"){
      res_all[[type]] = mean(model.matrix(x, type = "lhs"))

    } else if(type == "theta"){
      isNegbin = x$method == "fenegbin" || (x$method_type == "feNmlm" && x$family == "negbin")
      if(isNegbin){
        theta = x$coefficients[".theta"]
        names(theta) = "Overdispersion"
      } else {
        theta = NA
      }

      res_all[[type]] = theta

    } else if(type %in% r2_types){
      res_all[[type]] = r2(x, type)

    } else {

      #
      # Types for which root.value is allowed
      #

      if(grepl(".", type, fixed = TRUE)){
        root = gsub("\\..+", "", type)
        value = gsub(".+\\.", "", type)
      } else {
        root = type
        value = ""
      }

      # We need to normalize the types
      from_to = c("ivwald" = "ivwald1", "ivf" = "ivf1")
      if(root %in% names(from_to)){
        type = gsub(root, from_to[root], type, fixed = TRUE)
        root = from_to[root]
      }

      if(root == "f"){
        if(!is.null(x$ssr)){
          df1 = degrees_freedom(x, "k") - 1
          df2 = degrees_freedom(x, "t")

          if(isTRUE(x$iv) && x$iv_stage == 2){
            # We need to compute the SSR
            w = 1
            if(!is.null(x$weights)) w = x$weights
            ssr = cpp_ssq(x$iv_residuals, w)
          } else {
            ssr = x$ssr
          }

          stat = ((x$ssr_null - ssr) / df1) / (ssr / df2)
          p = pf(stat, df1, df2, lower.tail = FALSE)
          vec = list(stat = stat, p = p, df1 = df1, df2 = df2)
          res_all[[type]] = set_value(vec, value)

        } else {
          res_all[[type]] = NA
        }
      } else if(root == "wf"){
        df1 = length(x$coefficients)

        if(!is.null(x$ssr_fe_only) && df1 > 0){
          df2 = x$nobs - x$nparams
          stat = ((x$ssr_fe_only - x$ssr) / df1) / (x$ssr / df2)
          p = pf(stat, df1, df2, lower.tail = FALSE)
          vec = list(stat = stat, p = p, df1 = df1, df2 = df2)
          res_all[[type]] = set_value(vec, value)

        } else {
          res_all[[type]] = NA
        }


      } else if(root == "ivfall"){
        if(isTRUE(x$iv)){
          if(x$iv_stage == 1){
            df1 = degrees_freedom(x, vars = x$iv_inst_names_xpd)
            df2 = degrees_freedom(x, "resid")

            stat = ((x$ssr_no_inst - x$ssr) / df1) / (x$ssr / df2)
            p = pf(stat, df1, df2, lower.tail = FALSE)
            vec = list(stat = stat, p = p, df1 = df1, df2 = df2)
            res_all[[type]] = set_value(vec, value)

          } else {
            # f stat for the second stage

            df1 = degrees_freedom(x, vars = x$iv_endo_names_fit)
            df2 = degrees_freedom(x, "resid")

            w = 1
            if(!is.null(x$weights)) w = x$weights
            ssr = cpp_ssq(x$iv_residuals, w)

            stat = ((x$ssr_no_endo - ssr) / df1) / (ssr / df2)
            p = pf(stat, df1, df2, lower.tail = FALSE)
            vec = list(stat = stat, p = p, df1 = df1, df2 = df2)
            res_all[[type]] = set_value(vec, value)
          }

        } else {
          res_all[[type]] = NA
        }

      } else if(root == "ivf1"){

        if(isTRUE(x$iv)){
          df1 = degrees_freedom(x, vars = x$iv_inst_names_xpd, stage = 1)
          df2 = degrees_freedom(x, "resid", stage = 1)

          if(x$iv_stage == 1){

            stat = ((x$ssr_no_inst - x$ssr) / df1) / (x$ssr / df2)
            p = pf(stat, df1, df2, lower.tail = FALSE)
            vec = list(stat = stat, p = p, df1 = df1, df2 = df2)
            res_all[[type]] = set_value(vec, value)

          } else {
            x_first = x$iv_first_stage

            for(endo in names(x_first)){
              stat = ((x_first[[endo]]$ssr_no_inst - x_first[[endo]]$ssr) / df1) / (x_first[[endo]]$ssr / df2)
              p = pf(stat, df1, df2, lower.tail = FALSE)
              vec = list(stat = stat, p = p, df1 = df1, df2 = df2)
              res_all[[paste0(type, "::", endo)]] = set_value(vec, value)
            }
          }

        } else {
          res_all[[type]] = NA
        }

      } else if(root == "ivf2"){
        if(isTRUE(x$iv) && x$iv_stage == 2){
          # f stat for the second stage

          df1 = degrees_freedom(x, vars = x$iv_endo_names_fit)
          df2 = degrees_freedom(x, "resid")

          w = 1
          if(!is.null(x$weights)) w = x$weights
          ssr = cpp_ssq(x$iv_residuals, w)

          stat = ((x$ssr_no_endo - ssr) / df1) / (ssr / df2)
          p = pf(stat, df1, df2, lower.tail = FALSE)
          vec = list(stat = stat, p = p, df1 = df1, df2 = df2)
          res_all[[type]] = set_value(vec, value)

        } else {
          res_all[[type]] = NA
        }

      } else if(root == "kpr"){
        if(isTRUE(x$iv) && x$iv_stage == 2){
          # The KP rank test is computed in a specific function

          vec = kp_stat(x)
          res_all[[type]] = set_value(vec, value)

        } else {
          res_all[[type]] = NA
        }

      } else if(root == "cd"){
        if(isTRUE(x$iv) && x$iv_stage == 2){
          # The KP rank test is computed in a specific function

          vec = cd_stat(x)
          res_all[[type]] = set_value(vec, value)

        } else {
          res_all[[type]] = NA
        }

      } else if(root == "wald"){
        # Joint nullity of the coefficients
        # if FE => on the projected model only

        qui = !names(x$coefficients) %in% "(Intercept)"
        my_coef = x$coefficients[qui]
        df1 = length(my_coef)

        if(df1 > 0){
          df2 = degrees_freedom(x, "resid")

          # The VCOV is always full rank in here
          stat = .wald(x, names(my_coef))
          p = pf(stat, df1, df2, lower.tail = FALSE)
          vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type"))
          res_all[[type]] = set_value(vec, value)

        } else {
          # Only fixef
          res_all[[type]] = NA
        }

      } else if(root == "ivwaldall"){

        if(isTRUE(x$iv)){
          if(x$iv_stage == 1){

            df1 = degrees_freedom(x, vars = x$iv_inst_names_xpd)
            df2 = degrees_freedom(x, "resid")

            stat = .wald(x, x$iv_inst_names_xpd)

            p = pf(stat, df1, df2, lower.tail = FALSE)
            vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type"))
            res_all[[type]] = set_value(vec, value)

          } else {
            # wald stat for the second stage

            df1 = degrees_freedom(x, vars = x$iv_endo_names_fit)
            df2 = degrees_freedom(x, "resid")

            stat = .wald(x, x$iv_endo_names_fit)

            p = pf(stat, df1, df2, lower.tail = FALSE)
            vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type"))
            res_all[[type]] = set_value(vec, value)
          }

        } else {
          res_all[[type]] = NA
        }

      } else if(root %in% "ivwald1"){

        if(isTRUE(x$iv)){
          df1 = degrees_freedom(x, vars = x$iv_inst_names_xpd, stage = 1)
          df2 = degrees_freedom(x, "resid", stage = 1)


          if(x$iv_stage == 1){

            stat = .wald(x, x$iv_inst_names_xpd)

            p = pf(stat, df1, df2, lower.tail = FALSE)
            vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type"))
            res_all[[type]] = set_value(vec, value)

          } else {
            x_first = x$iv_first_stage
            inst = x$iv_inst_names_xpd

            for(endo in names(x_first)){

              my_x_first = x_first[[endo]]

              if(is.null(my_x_first$cov.scaled)){
                # We compute the VCOV like for the second stage
                flags = x$summary_flags
                if(is.null(flags) || !is.null(dots$se) || !is.null(dots$cluster)){
                  my_x_first = summary(my_x_first, ...)
                } else {
                  my_x_first = summary(my_x_first, vcov = flags$vcov, ssc = flags$ssc, ...)
                }
              }

              stat = .wald(my_x_first, inst)

              p = pf(stat, df1, df2, lower.tail = FALSE)
              vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type"))
              res_all[[paste0(type, "::", endo)]] = set_value(vec, value)
            }
          }

        } else {
          res_all[[type]] = NA
        }

      } else if(root == "ivwald2"){
        if(isTRUE(x$iv) && x$iv_stage == 2){
          # wald stat for the second stage

          df1 = degrees_freedom(x, vars = x$iv_endo_names_fit)
          df2 = degrees_freedom(x, "resid")

          stat = .wald(x, x$iv_endo_names_fit)

          p = pf(stat, df1, df2, lower.tail = FALSE)
          vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type"))
          res_all[[type]] = set_value(vec, value)

        } else {
          res_all[[type]] = NA
        }

      } else if(root == "wh"){
        if(isTRUE(x$iv) && x$iv_stage == 2){
          # Wu Hausman stat for the second stage
          vec = x$iv_wh
          res_all[[type]] = set_value(vec, value)

        } else {
          res_all[[type]] = NA
        }

      } else if(root == "sargan"){
        if(!is.null(x$iv_sargan)){
          # Wu Hausman stat for the second stage
          vec = x$iv_sargan
          res_all[[type]] = set_value(vec, value)

        } else {
          res_all[[type]] = NA
        }

      } else if(root == "lr"){
        if(!is.null(x$ll_null)){

          stat = 2 * (x$loglik - x$ll_null)
          df = x$nparams
          p = pchisq(stat, df, lower.tail = FALSE)

          vec = list(stat = stat, p = p, df = df)
          res_all[[type]] = set_value(vec, value)

        } else {
          res_all[[type]] = NA
        }

      } else if(root == "wlr"){
        if(!is.null(x$ll_fe_only) || (x$method_type == "feglm" && !is.null(x$fixef_id))){

          if(x$method_type == "feglm"){
            # estimation of the FE only model

            newdata = cbind(data.frame(y = x$y), as.data.frame(x$fixef_id))
            if(!is.null(x$fixef_terms)){
              newdata = cbind(newdata, as.data.frame(x$slope_variables))
            }
            new_fml = merge_fml(y ~ 1, x$fml_all$fixef)
            res_fe = feglm(fml = new_fml, data = newdata, glm.tol = 1e-2, fixef.tol = 1e-3, family = x$family$family, weights = x$weights, offset = x$offset)

            ll_fe_only = logLik(res_fe)
          } else {
            ll_fe_only = x$ll_fe_only
          }

          stat = 2 * (x$loglik - ll_fe_only)
          df = length(x$coefficients)
          p = pchisq(stat, df, lower.tail = FALSE)

          vec = list(stat = stat, p = p, df = df)
          res_all[[type]] = set_value(vec, value)

        } else {
          res_all[[type]] = NA
        }

      } else if(root %in% user_types){

        res_all[[type]] = set_value(opts[[root]]$fun(x), value)

      } else {
        stop("The type '", type, "' is not supported, see details of ?fitstat, or use fitstat(show_types = TRUE) to get the names of all supported tests.")
      }
    }

  }

  if(simplify){
    verbose = FALSE
  }

  if(verbose){
    class(res_all) = "fixest_fitstat"
  }

  if(length(type_all) == 1 && simplify){
    return(res_all[[1]])
  }

  res_all
}


#' Wald test of nullity of coefficients
#'
#' Wald test used to test the joint nullity of a set of coefficients.
#'
#' @inheritParams print.fixest
#' @inheritParams summary.fixest
#' @inheritParams etable
#'
#' @param print Logical, default is `TRUE`. If `TRUE`, then a verbose description of the test 
#' is prompted on the R console. Otherwise only a named vector containing the test statistics 
#' is returned.
#' @param ... Any other element to be passed to [`summary.fixest`].
#'
#' @details
#' The type of VCOV matrix plays a crucial role in this test. Use the arguments `se` and 
#' `cluster` to change the type of VCOV for the test.
#'
#' @return
#' A named vector containing the following elements is returned: `stat`, `p`, `df1`, 
#' and `df2`. They correspond to the test statistic, the p-value, the first and 
#' second degrees of freedoms.
#'
#' If no valid coefficient is found, the value `NA` is returned.
#'
#' @examples
#'
#' data(airquality)
#'
#' est = feols(Ozone ~ Solar.R + Wind + poly(Temp, 3), airquality)
#'
#' # Testing the joint nullity of the Temp polynomial
#' wald(est, "poly")
#'
#' # Same but with clustered SEs
#' wald(est, "poly", cluster = "Month")
#'
#' # Now: all vars but the polynomial and the intercept
#' wald(est, drop = "Inte|poly")
#'
#' #
#' # Toy example: testing pre-trends
#' #
#'
#' data(base_did)
#'
#' est_did = feols(y ~ x1 + i(period, treat, 5) | id + period, base_did)
#'
#' # The graph of the coefficients
#' coefplot(est_did)
#'
#' # The pre-trend test
#' wald(est_did, "period::[1234]$")
#'
#' # If "period::[1234]$" looks weird to you, check out
#' # regular expressions: e.g. see ?regex.
#' # Learn it, you won't regret it!
#'
#'
wald = function(x, keep = NULL, drop = NULL, print = TRUE, vcov, se, cluster, ...){
  # LATER:
  # - keep can be a list
  #   * list("fit_" = 1, "x5$")
  #   * regex = restriction. No "=" => 0

  check_arg(x, "class(fixest)")
  check_arg(keep, drop, "NULL character vector no na")

  if(isTRUE(x$onlyFixef)) return(NA)

  dots = list(...)
  if(!isTRUE(x$summary)|| !missing(vcov) || !missing(se) || !missing(cluster) || ...length() > 0){
    x = summary(x, vcov = vcov, se = se, cluster = cluster, ...)
  }

  if(missing(keep) && missing(drop)){
    drop = "\\(Intercept\\)"
  }

  coef_name = names(x$coefficients)
  coef_name = keep_apply(coef_name, keep)
  coef_name = drop_apply(coef_name, drop)

  if(length(coef_name) == 0) return(NA)

  qui = names(x$coefficients) %in% coef_name
  my_coef = x$coefficients[qui]
  df1 = length(my_coef)

  df2 = x$nobs - x$nparams

  # The VCOV is always full rank in here
  stat = drop(my_coef %*% solve(x$cov.scaled[qui, qui]) %*% my_coef) / df1
  p = pf(stat, df1, df2, lower.tail = FALSE)
  vcov = attr(x$cov.scaled, "type")
  vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = vcov)

  if(print){
    cat("Wald test, H0: ", ifsingle(coef_name, "", "joint "), "nullity of ", enumerate_items(coef_name), "\n", sep  ="")
    cat(" stat = ", numberFormatNormal(stat),
      ", p-value ", ifelse(p < 2.2e-16, "< 2.2e-16", paste0("= ", numberFormatNormal(p))),
      ", on ", numberFormatNormal(df1), " and ", numberFormatNormal(df2), " DoF, ",
      "VCOV: ", vcov, ".", sep = "")

    return(invisible(vec))
  } else {
    return(vec)
  }
}

fitstat_validate = function(x, vector = FALSE){
  check_value(x, "NA | os formula | charin(FALSE) | character vector no na", .arg_name = "fitstat", .up = 1)

  if("formula" %in% class(x)){
    x = attr(terms(update(~ DEFAULT, x)), "term.labels")
  } else if (length(x) == 1 && (isFALSE(x) || is.na(x))){
    x = c()
  }

  # checking the types
  fitstat_fun_types = fitstat(give_types = TRUE)
  fitstat_type_allowed = fitstat_fun_types$types
  x = unique(x)
  type_alias = fitstat_fun_types$type_alias

  if(any(x %in% names(type_alias))){
    i = intersect(x, names(type_alias))
    x[x %in% i] = type_alias[x[x %in% i]]
  }

  pblm = setdiff(x, c(fitstat_type_allowed, "DEFAULT"))
  if(length(pblm) > 0){
    stop_up("In fitstat, argument `x` must be a one sided formula (or a character vector) containing valid types from the function fitstat (see details in ?fitstat or use fitstat(`show_types = TRUE`)). The type{$s, enum.q, is?pblm} not valid.")
  }

  if(length(x) == 0){
    x = NA
  } else if(vector){
    x = gsub("DEFAULT", ".", x, fixed = TRUE)
  } else {
    x = as.formula(paste("~", paste(gsub("DEFAULT", ".", x), collapse = "+")))
  }

  x
}


#' R2s of `fixest` models
#'
#' Reports different R2s for `fixest` estimations (e.g. [`feglm`] or [`feols`]).
#'
#' @param x A `fixest` object, e.g. obtained with function [`feglm`] or [`feols`].
#' @param type A character vector representing the R2 to compute. The R2 codes are of the form: 
#' "wapr2" with letters "w" (within), "a" (adjusted) and "p" (pseudo) possibly missing. 
#' E.g. to get the regular R2: use `type = "r2"`, the within adjusted R2: use `type = "war2"`, 
#' the pseudo R2: use `type = "pr2"`, etc. Use `"cor2"` for the squared correlation. 
#' By default, all R2s are computed.
#' @param full_names Logical scalar, default is `FALSE`. If `TRUE` then names of the vector 
#' in output will have full names instead of keywords (e.g. `Squared Correlation` 
#' instead of `cor2`, etc).
#'
#' @details
#'
#' The pseudo R2s are the McFaddens R2s, that is the ratio of log-likelihoods.
#'
#' For R2s with no theoretical justification, like e.g. regular R2s for maximum likelihood 
#' models -- or within R2s for models without fixed-effects, NA is returned. 
#' The single measure to possibly compare all kinds of models is the squared 
#' correlation between the dependent variable and the expected predictor.
#'
#' The pseudo-R2 is also returned in the OLS case, it corresponds to the 
#' pseudo-R2 of the equivalent GLM model with a Gaussian family.
#'
#' For the adjusted within-R2s, the adjustment factor is `(n - nb_fe) / (n - nb_fe - K)` 
#' with `n` the number of observations, `nb_fe` the number of fixed-effects and `K` 
#' the number of variables.
#'
#' @return
#' Returns a named vector.
#'
#' @author
#' Laurent Berge
#'
#' @examples
#'
#' # Load trade data
#' data(trade)
#'
#' # We estimate the effect of distance on trade (with 3 fixed-effects)
#' est = feols(log(Euros) ~ log(dist_km) | Origin + Destination + Product, trade)
#'
#' # Squared correlation:
#' r2(est, "cor2")
#'
#' # "regular" r2:
#' r2(est, "r2")
#'
#' # pseudo r2 (equivalent to GLM with Gaussian family)
#' r2(est, "pr2")
#'
#' # adjusted within r2
#' r2(est, "war2")
#'
#' # all four at once
#' r2(est, c("cor2", "r2", "pr2", "war2"))
#'
#' # same with full names instead of codes
#' r2(est, c("cor2", "r2", "pr2", "war2"), full_names = TRUE)
#'
r2 = function(x, type = "all", full_names = FALSE){
  # p: pseudo
  # w: within
  # a: adjusted

  check_arg(full_names, "logical scalar")

  if(!"fixest" %in% class(x)){
    stop("Only 'fixest' objects are supported.")
  }

  check_arg(type, "character vector no na", 
            .message = "Argument 'type' must be a character vector (e.g. type = c(\"cor2\", \"r2\", \"pr2\")). (a: adjused, p: pseudo, w: within.)")

  # type_allowed next => ("count", "acount") ?
  dict_names = c("cor2" = "Squared Correlation", "r2" = "R2", "ar2" = "Adjusted R2", 
                 "pr2" = "Pseudo R2", "apr2" = "Adjusted Pseudo R2", "wr2" = "Within R2", 
                 "war2" = "Adjusted Within R2", "wpr2" = "Within Pseudo R2", 
                 "wapr2" = "Adjusted Within Pseudo R2")
  
  type_allowed = c("cor2", "r2", "ar2", "pr2", "apr2", "wr2", "war2", "wpr2", "wapr2")
  types_alias = c(par2 = "apr2", awr2 = "war2", pwr2 = "wpr2", wpar2 = "wapr2", 
                  pwar2 = "wapr2", pawr2 = "wapr2", apwr2 = "wapr2", awpr2 = "wapr2", 
                  sq.cor = "cor2")
  
  if("all" %in% type){
    type_all = type_allowed
  } else {
    type_all = tolower(type)
    pblm = setdiff(type_all, c(type_allowed, names(types_alias)))
    if(length(pblm) > 0){
      stopi("The r2 type{$s, enum.q, is ? pblm} not valid.")
    }
  }

  type_all = dict_apply(type_all, types_alias)

  is_ols = x$method_type == "feols"
  isFixef = "fixef_vars" %in% names(x)
  n = nobs(x)

  res = rep(NA, length(type_all))
  for(i in seq_along(type_all)){
    myType = type_all[i]

    if(myType == "cor2"){
      res[i] = x$sq.cor
      next
    }

    if(!grepl("p", myType) && !is_ols){
      # non pseudo R2 not valid for non ols
      next
    }

    if(grepl("w", myType) && !isFixef){
      # within R2 not valid for models without FE
      next
    }

    adj = grepl("a", myType)
    pseudo = grepl("p", myType)
    within = grepl("w", myType) && isFixef
    ifNullNA = function(x) ifelse(is.null(x), NA, x)
    if(within && isFixef){

      if(isTRUE(x$onlyFixef)) next

      if(is.null(x$ssr_fe_only) && !is.null(x$fixef_vars)){
        # This is the case of feglm where there were no fe_only model estimated
        # => we need to compute the FE model first

        # 2019-11-26: now self contained call (no need for outer frame evaluation)

        if(isTRUE(x$lean)){
          stop("Within R2s are not available for 'lean' fixest objects. Please reestimate with 'lean = FALSE'.")
        }

        # constructing the data
        newdata = cbind(data.frame(y = x$y), as.data.frame(x$fixef_id))
        if(!is.null(x$fixef_terms)){
          newdata = cbind(newdata, as.data.frame(x$slope_variables))
        }
        # Fe/slope only formula
        new_fml = merge_fml(y ~ 1, x$fml_all$fixef)

        # x$family$family is also normal
        res_fe = feglm(fml = new_fml, data = newdata, glm.tol = 1e-2, 
                       fixef.tol = 1e-3, family = x$family$family, 
                       weights = x$weights, offset = x$offset)

        x$ssr_fe_only = cpp_ssq(res_fe$residuals)
        x$ll_fe_only = logLik(res_fe)
      }

      # within
      df_k = ifelse(adj, length(coef(x)), 0)
      if(pseudo){
        ll_fe_only = ifNullNA(x$ll_fe_only)
        ll = logLik(x)
        res[i] = 1 - (ll - df_k) / ll_fe_only
      } else {
        ssr_fe_only = ifNullNA(x$ssr_fe_only)
        nb_fe = x$nparams - length(coef(x))
        res[i] = 1 - x$ssr / ssr_fe_only * (n - nb_fe) / (n - nb_fe - df_k)
      }
    } else {

      if(x$nparams == 1 && string_any(names(x$coefficients), "f/(Intercept)")){
        res[i] = NA

      } else {
        df_k = ifelse(adj, x$nparams, 1 - pseudo)
        if(pseudo){
          ll_null = x$ll_null
          ll = logLik(x)
          if(adj){
            res[i] = 1 - (ll - x$nparams + 1) / ll_null
          } else {
            res[i] = 1 - ll / ll_null
          }
        } else {
          ssr_null = x$ssr_null
          df.intercept = 1 * (isFixef || string_any(names(x$coefficients), "f/(Intercept)"))
          res[i] = 1 - x$ssr / ssr_null * (n - df.intercept) / (n - df_k)
        }
      }

    }
  }

  if(full_names){
    names(res) = dict_apply(type_all, dict_names)
  } else {
    names(res) = type_all
  }


  res
}


#' Gets the degrees of freedom of a `fixest` estimation
#'
#' Simple utility to extract the degrees of freedom from a `fixest` estimation.
#'
#' @inheritParams vcov.fixest
#'
#' @param x A `fixest` estimation.
#' @param type Character scalar, equal to "k", "resid", "t". If "k", then the number of 
#' regressors is returned. If "resid", then it is the "residuals degree of freedom", i.e. 
#' the number of observations minus the number of regressors. If "t", it is the degrees of 
#' freedom used in the t-test. Note that these values are affected by how the VCOV of `x` 
#' is computed, in particular when the VCOV is clustered.
#' @param vars A vector of variable names, of the regressors. This is optional. If provided, 
#' then `type` is set to 1 by default and the number of regressors contained in `vars` 
#' is returned. This is only useful in the presence of collinearity and we want a subset of 
#' the regressors only. (Mostly for internal use.)
#' @param stage Either 1 or 2. Only concerns IV regressions, which stage to look at.
#'
#' The type of VCOV can have an influence on the degrees of freedom. In particular, when the 
#' VCOV is clustered, the DoF returned will be in accordance with the way the small 
#' sample correction was performed when computing the VCOV. That type of value is in general 
#' not what we have in mind when we think of "degrees of freedom". To obtain the ones that are 
#' more intuitive, please use `degrees_freedom_iid` instead.
#'
#'
#' @examples
#'
#' # First: an estimation
#'
#' base = iris
#' names(base) = c("y", "x1", "x2", "x3", "species")
#' est = feols(y ~ x1 + x2 | species, base)
#'
#' # "Normal" standard-errors (SE)
#' est_standard = summary(est, se = "st")
#'
#' # Clustered SEs
#' est_clustered = summary(est, se = "clu")
#'
#' # The different degrees of freedom
#'
#' # => different type 1 DoF (because of the clustering)
#' degrees_freedom(est_standard, type = "k")
#' degrees_freedom(est_clustered, type = "k") # fixed-effects are excluded
#'
#' # => different type 2 DoF (because of the clustering)
#' degrees_freedom(est_standard, type = "resid") # => equivalent to the df.residual from lm
#' degrees_freedom(est_clustered, type = "resid")
#'
#'
#'
degrees_freedom = function(x, type, vars = NULL, vcov = NULL, se = NULL, cluster = NULL, ssc = NULL, stage = 2){
  check_arg(x, "class(fixest) mbt")
  check_set_arg(type, "match(k, resid, t)")
  check_arg(stage, "integer scalar GE{1} LE{2}")
  check_arg(vars, "character vector no na")

  if(stage == 1 && isTRUE(x$iv) && x$iv_stage == 2){
    x = x$iv_first_stage[[1]]
  }

  if(!missnull(vars)){
    if(!missing(type) && type != "k"){
      warning("The argument 'type' is ignored when the argument 'vars' is present. Type 'k' is returned.")
    }

    vars_keep = intersect(vars, names(x$coefficients))
    return(length(vars_keep))
  }

  if(missing(type)){
    stop("The argument 'type' is required but is currently missing.")
  }

  if(!isTRUE(x$summary) || !missnull(vcov) || !missnull(se) || !missnull(cluster) || !missnull(ssc)){
    x = summary(x, vcov = vcov, se = se, cluster = cluster, ssc = ssc)
  }

  vcov = x$cov.scaled

  if(is.null(vcov)){
    dof.K = x$nparams
    df.t = nobs(x) - dof.K
  } else {
    # From v0.10.0 onward, "dt.t" is always provided!
    df.t = attr(vcov, "df.t")
    dof.K = attr(vcov, "dof.K")
  }

  res = switch(type,
         "k" = dof.K,
         "resid" = max(nobs(x) - dof.K, 1),
         "t" = df.t)

  res
}


#' @describeIn degrees_freedom Gets the degrees of freedom of a `fixest` estimation
degrees_freedom_iid = function(x, type){
  degrees_freedom(x, type, vcov = "iid")
}

####
#### Stats -- internal ####
####

.wald = function(x, var){
  # x: fixest estimation

  coef = x$coefficients

  vcov = x$cov.scaled
  if(is.null(vcov)){
    # => INTERNAL ERROR
    stop("INTERNAL ERROR: .wald should be applied only to objects with a VCOV already computed. Could you report the error to the maintainer of fixest?")
  }

  var_keep = intersect(var, names(coef))

  if(length(var_keep) == 0){
    # All vars removed bc of collin => stat = 0
    return(0)
  }

  # To handle errors (rare but can happen)
  if(any(diag(vcov) < 1e-10)){
    vcov = mat_posdef_fix(vcov)
  }

  chol_decomp = cpp_cholesky(vcov[var_keep, var_keep, drop = FALSE])
  vcov_inv = chol_decomp$XtX_inv

  if(isTRUE(chol_decomp$all_removed)){
    # Can happen for indicators + clustering at the same level
    return(1e6)
  }

  if(any(chol_decomp$id_excl)){
    var_keep = var_keep[!chol_decomp$id_excl]
  }

  my_coef = coef[var_keep]

  stat = drop(my_coef %*% vcov_inv %*% my_coef) / length(my_coef)

  stat
}


kp_stat = function(x){
  # internal function => x must be a fixest object
  #
  # The code here is a translation of the ranktest.jl function from the Vcov.jl package
  # from @matthieugomez (see https://github.com/matthieugomez/Vcov.jl)
  #


  if(!isTRUE(x$iv) || !x$iv_stage == 2) return(NA)

  # Necessary data

  X_proj = as.matrix(resid(summary(x, stage = 1)))

  # while the projection of X has already been computed, we need to do it
  # for Z => that's a pain in the neck
  # computational cost is much lower if I include the KP computation at
  # estimation time.... since here we do the job twice... we'll see

  Z = model.matrix(x, type = "iv.inst")
  Z_proj = proj_on_U(x, Z)

  k = n_endo = ncol(X_proj)
  l = n_inst = ncol(Z)

  # We assume l >= k
  q = min(k, l) - 1

  # Let's go

  if(n_endo == 1){
    PI = t(coef(summary(x, stage = 1)))
  } else {
    PI = coef(summary(x, stage = 1))
  }
  PI = PI[, colnames(PI) %in% x$iv_inst_names_xpd, drop = FALSE]

  Fmat = chol(crossprod(Z_proj))
  Gmat = chol(crossprod(X_proj))
  theta = Fmat %*% t(solve(t(Gmat)) %*% PI)
  # theta: n_inst x n_endo

  if(n_inst == n_endo){
    svd_decomp = svd(theta)
    u = svd_decomp$u
    vt = t(svd_decomp$v)
  } else {
    # we need full decomp => un optimized decomp
    svd_decomp = mat_svd(theta)
    u = svd_decomp$u
    vt = svd_decomp$vt
  }

  u_sub = u[k:l, k:l, drop = FALSE]
  vt_sub = vt[k, k]

  vt_k = vt[1:k, k, drop = FALSE]

  ssign = function(x) if(x == 0) 1 else sign(x)

  # There may be more sign problems here
  if(k == l){
    a_qq = ssign(u_sub[1]) * u[1:l, k:l, drop = FALSE]
    b_qq = ssign(vt_sub[1]) * t(vt_k)
  } else {
    a_qq = u[1:l, k:l] %*% (solve(u_sub) %*% mat_sqrt(u_sub %*% t(u_sub)))
    b_qq = mat_sqrt(vt_sub %*% t(vt_sub)) %*% (solve(t(vt_sub)) %*% t(vt_k))
  }

  # kronecker
  kronv = kronecker(b_qq, t(a_qq))
  lambda = kronv %*% c(theta)

  # There is need to compute the vcov specifically for this case
  # We do it the same way as it was for x

  VCOV_TYPE = attr(x$cov.scaled, "type")

  if(identical(VCOV_TYPE, "IID")){
    vlab = chol(tcrossprod(kronv) / nrow(X_proj))

  } else {
    K = t(kronecker(Gmat, Fmat))

    my_scores = do.call(cbind, lapply(1:ncol(X_proj), function(i) Z_proj * X_proj[, i]))

    x_new = x
    x_new$scores = my_scores

    vcov = x$summary_flags$vcov
    ssc = x$summary_flags$ssc

    meat = vcov(x_new, vcov = vcov, ssc = ssc, sandwich = FALSE)
    vhat = solve(K, t(solve(K, meat)))

    # DOF correction now
    n = nobs(x) - identical(VCOV_TYPE, "cluster")
    df_resid = degrees_freedom(x, "resid", stage = 1)
    vhat = vhat * n / df_resid

    vlab = kronv %*% vhat %*% t(kronv)
  }

  r_kp = t(lambda) %*% solve(vlab, lambda)
  # cat("KP r:\n") ; print(r_kp)

  # Now the results
  kp_df = n_inst - n_endo + 1
  kp_f = r_kp / n_inst
  kp_p = pchisq(kp_f, kp_df, lower.tail = FALSE)

  list(stat = kp_f, p = kp_p, df = kp_df)
}

cd_stat = function(x){
  # internal function
  # x: fixest object
  #

  if(!isTRUE(x$iv) || !x$iv_stage == 2) return(NA)

  # Necessary data
  X = model.matrix(x, type = "iv.endo")
  X_proj = proj_on_U(x, X)

  Z = model.matrix(x, type = "iv.inst")
  Z_proj = proj_on_U(x, Z)

  n = nobs(x)
  n_endo = NCOL(X_proj)
  n_inst = NCOL(Z_proj)

  # The stat
  if(FALSE){
    # => just use the canonical correlation, it's faster

    df_resid = degrees_freedom(x, resid, stage = 1)
    V = resid(summary(x, stage = 1))

    P_Z_proj = Z_proj %*% solve(crossprod(Z_proj)) %*% t(Z_proj)

    SIGMA_vv_hat = (t(X) %*% V) / df_resid

    inv_sqrt_SIGMA_vv_hat = solve(mat_sqrt(SIGMA_vv_hat))

    G = (t(inv_sqrt_SIGMA_vv_hat) %*% t(X_proj) %*% P_Z_proj %*% X_proj %*% inv_sqrt_SIGMA_vv_hat) / n_inst

    cd = min(eigen(G)$values)

  } else {
    cc = min(cancor(X_proj, Z_proj)$cor)
    cd = ((n - n_endo - n_inst - 1) / n_inst) / ((1 - cc^2) / cc^2)
  }

  cd
}


####
#### Misc internal funs ####
####

mat_sqrt = function(A){
  e = eigen(A)
  # e$vectors %*% diag(x = sqrt(e$values), nrow = nrow(A)) %*% t(e$vectors)
  ev = pmax(e$values, 1e-10)
  M = e$vectors %*% diag(x = ev**.25, nrow = nrow(A))
  tcrossprod(M)
}


mat_svd = function(A){
  # From https://rpubs.com/aaronsc32/singular-value-decomposition-r
  # https://math.stackexchange.com/questions/2359992/how-to-resolve-the-sign-issue-in-a-svd-problem

  ATA = crossprod(A)
  ATA.e = eigen(ATA)
  v = ATA.e$vectors

  AAT = tcrossprod(A)
  AAT.e = eigen(AAT)
  u = AAT.e$vectors
  r = sqrt(ATA.e$values)

  # we want the last diag element of vt to be positive
  vt = t(v)
  k = nrow(vt)
  if(nrow(u) == k && vt[k, k] < 0){
    vt[k, ] = - vt[k, ]
    u[, k] = - u[, k]
  }

  list(u = u, vt = vt, d = r)
}





proj_on_U = function(x, Z){
  # Projects some variables (either the endo or the inst) on the
  # exogenous variables (U)
  #
  # x: a fixest IV estimation
  #
  # I write Z in the arguments, but it can be any matrix (like X)
  #

  U = model.matrix(x, type = "iv.exo")
  is_U = !is.null(U)

  if(!is.null(x$fixef_vars)){
    # we need to center the variables
    # Of course, if there are varying slopes, that's a pain in the neck

    if(!is.null(x$slope_flag)){
      Z_dm = demean(Z, x$fixef_id[order(x$fixef_sizes, decreasing = TRUE)],
                    slope.vars = x$slope_variables_reordered,
                    slope.flag = x$slope_flag_reordered)

      if(is_U){
        U_dm = demean(U, x$fixef_id[order(x$fixef_sizes, decreasing = TRUE)],
                      slope.vars = x$slope_variables_reordered,
                      slope.flag = x$slope_flag_reordered)
      }

    } else {
      Z_dm = demean(Z, x$fixef_id)
      if(is_U){
        U_dm = demean(U, x$fixef_id)
      }
    }

    if(is_U){
      Z_proj = resid(feols.fit(Z_dm, U_dm))
    } else {
      Z_proj = Z_dm
    }

  } else if(is_U){
    Z_proj = resid(feols.fit(Z, U))
  } else {
    Z_proj = Z
  }

  Z_proj
}

Try the fixest package in your browser

Any scripts or data that you put into this service are public.

fixest documentation built on June 22, 2024, 9:12 a.m.