# R/fitstats.R In fixest: Fast Fixed-Effects Estimations

#### Documented in degrees_freedomdegrees_freedom_iidfitstatfitstat_registerprint.fixest_fitstatr2wald

#----------------------------------------------#
# Author: Laurent Berge
# Date creation: Mon Apr 19 17:23:31 2021
# ~: Various fit statistics
#----------------------------------------------#

####
#### stats -- User level ####
####

#' Print method for fit statistics of fixest estimations
#'
#' Displays a brief summary of selected fit statistics from the function [fitstat].
#'
#' @inherit fitstat examples
#'
#' @param x An object resulting from the [fitstat] function.
#' @param na.rm Logical, default is FALSE. If TRUE, the statistics that are missing are not displayed.
#' @param ... Not currently used.
#'
#'
print.fixest_fitstat = function(x, na.rm = FALSE, ...){

dots = list(...)

# glue = function(x) gsub("( +)(,)", "\\2\\1", paste(x[nchar(x) > 0], collapse = ", "))
glue = function(x) paste(x[nchar(x) > 0], collapse = ", ")

all_types = fitstat(give_types = TRUE)
dict_type = all_types$R_alias if(na.rm){ IS_NA = sapply(x, function(z) identical(z, NA)) x = x[!IS_NA] if(length(x) == 0) return(invisible(NULL)) } if(isTRUE(dots$group.solo)){
# Later integration in print
solo = lengths(x) == 1
x_solo = unlist(x[solo])
n_solo = length(x_solo)

if(n_solo > 1){
x_solo_names = names(x[solo])
x_solo = unlist(x[solo])

x = x[!solo]

nr = ceiling(n_solo / 2)
m = matrix(c(x_solo, NA)[1:(2*nr)], nr, 2, byrow = TRUE)
m_alias = matrix(c(unlist(dict_type[x_solo_names]), "")[1:(2*nr)], nr, 2, byrow = TRUE)

if(nr > 1 && anyNA(m[nr, ])){
m[nr, ] = m[nr, 2:1]
m_alias[nr, ] = m_alias[nr, 2:1]
}

cols_new = list()
for(i in 1:2){
col = m[,i]
gt1 = abs(col) > 1 & !is.na(col)
col_char = numberFormatNormal(col)
col_char[gt1] = sfill(col_char[gt1], anchor = ".", na = "")
col_char[!gt1] = sfill(col_char[!gt1], anchor = ".", na = "")
col_char = sfill(col_char, right = TRUE)

# The aliases
col_char = paste0(sfill(m_alias[, i]), ": ", col_char)
if(anyNA(col)){
col_char[is.na(col)] = sfill("", nchar(col_char[1]))
}

cols_new[[i]] = col_char
}
mm = do.call(cbind, cols_new)

cat(apply(mm, 1, paste, collapse = "   "), sep = "\n")
}

if(length(x) == 0) return(invisible(NULL))
}

res = c()

# Formatting the stats and pvalues
is_stat = sapply(x, function(z) "stat" %in% names(z))
is_pval = sapply(x, function(z) "p" %in% names(z))

if(any(is_stat)){
stat_all = sapply(x[is_stat], function(z) z$stat) stat_all = paste0("stat = ", sfill(sfill(numberFormatNormal(stat_all), anchor = "."), right = TRUE)) for(i in seq_along(stat_all)){ j = which(is_stat)[i] x[[j]]$stat = stat_all[i]
}
}

if(any(is_pval)){
pval_all = sapply(x[is_pval], function(z) z$p) low = pval_all < 2.2e-16 & !is.na(pval_all) pval_all[low] = 2.2e-16 pval_all = paste0("p ", ifelse(low, "< ", "= "), sfill(numberFormatNormal(pval_all), right = TRUE)) for(i in seq_along(pval_all)){ j = which(is_pval)[i] x[[j]]$p = pval_all[i]
}
}

# formatting for multiple endo regs
qui_right = rep(FALSE, length(x))

same_as = function(x, y) isTRUE(all.equal(x, y, check.attributes = FALSE))

for(i in seq_along(x)){
type = names(x)[i]

v = x[[i]]

if(grepl("::", type, fixed = TRUE)){
dict = getFixest_dict()
rename_fun = function(x) paste0(dict_type[x[1]], ", ", paste(sapply(x[-1], dict_apply, dict = dict), collapse = "::"))
test_name = rename_fun(strsplit(type, "::")[[1]])
qui_right[i] = TRUE
} else {
if(type %in% names(dict_type)){
test_name = dict_type[type]
} else if(!grepl(".", type, fixed = TRUE)) {
warning("Current type '", type, "' has no alias.")
test_name = type
} else {
type_split = strsplit(type, ".", fixed = TRUE)[[1]]
test_name = paste0(dict_type[type_split[1]], ", ", dict_type[type_split[2]])
}

}

if(length(v) == 1){
# Basic display
res[i] = paste0(test_name, "! ", numberFormatNormal(v))

} else if(type %in% all_types$user_types){ # we can have extra values for user defined functions opts = getOption("fixest_fitstat_user") alias_subtypes = opts[[type]]$alias_subtypes

arg_list = list()
skip_df2 = FALSE
for(k in seq_along(v)){
subtype = names(v)[k]

if(subtype == "df2" && skip_df2) next

if(subtype == "stat"){
arg_list[[k]] = v$stat } else if(subtype == "p"){ arg_list[[k]] = v$p

} else if(subtype == "df" && same_as(alias_subtypes["df"], "df")){
arg_list[[k]] = paste0("on ", numberFormatNormal(v$df), " DoF") } else if(subtype == "df1" && "df2" %in% names(v) && same_as(alias_subtypes["df1"], "df1")){ arg_list[[k]] = paste0("on ", numberFormatNormal(v$df1), " and ", numberFormatNormal(v$df2), " DoF") skip_df2 = TRUE } else if(subtype == "vcov"){ arg_list[[k]] = paste0(alias_subtypes[subtype], ": ", v$vcov)

} else {
arg_list[[k]] = paste0(alias_subtypes[subtype], " = ", numberFormatNormal(v[[subtype]]))

}
}

res[i] = paste0(test_name, "! ", glue(arg_list), ".")

} else {

stat_line = p_line = dof_line = vcov_line = ""
if(!is.null(v$stat)) stat_line = v$stat
if(!is.null(v$p)) p_line = v$p
if(!is.null(v$df)) dof_line = paste0("on ", numberFormatNormal(v$df), " DoF")
if(!is.null(v$df1)) dof_line = paste0("on ", numberFormatNormal(v$df1), " and ", numberFormatNormal(v$df2), " DoF") if(!is.null(v$vcov)) vcov_line = paste0("VCOV: ", v$vcov) res[i] = paste0(test_name, "! ", glue(c(stat_line, p_line, dof_line, vcov_line)), ".") } } if(any(qui_right)){ res2fix = strsplit(res[qui_right], "!", fixed = TRUE) left = sapply(res2fix, function(x) x[1]) right = sapply(res2fix, function(x) x[2]) res_fixed = paste0(sfill(left, right = TRUE), "!", right) res[qui_right] = res_fixed } cat(gsub("!", ":", sfill(res, anchor = "!"), fixed = TRUE), sep = "\n") } #' Register custom fit statistics #' #' Enables the registration of custom fi statistics that can be easily summoned with the function [fitstat]. #' #' @inherit fitstat seealso #' #' @param type A character scalar giving the type-name. #' @param fun A function to be applied to a fixest estimation. It must return either a scalar, or a list of unitary elements. If the number of elements returned is greater than 1, then each element must be named! If the fit statistic is not valid for a given estimation, a plain NA value should be returned. #' @param alias A (named) character vector. An alias to be used in lieu of the type name in the display methods (ie when used in [print.fixest_fitstat] or [etable]). If the function returns several values, i.e. sub-types, you can give an alias to these sub-types. The syntax is c("type" = "alias", "subtype_i" = "alias_i"), with "type" (resp. "subtype") the value of the argument type resp. (subtypes). You can also give an alias encompassing the type and sub-type with the syntax c("type.subtype_i" = "alias"). #' @param subtypes A character vector giving the name of each element returned by the function fun. This is only used when the function returns more than one value. Note that you can use the shortcut "test" when the sub-types are "stat", "p" and "df"; and "test2" when these are "stat", "p", "df1" and "df2". #' #' @details #' If there are several components to the computed statistics (i.e. the function returns several elements), then using the argument subtypes, giving the names of each of these components, is mandatory. This is to ensure that the statistic can be used as any other built-in statistic (and there are too many edge cases impeding automatic deduction). #' #' @author Laurent Berge #' #' @examples #' #' # An estimation #' base = iris #' names(base) = c("y", "x1", "x2", "x3", "species") #' est = feols(y ~ x1 + x2 | species, base) #' #' # #' # single valued tests #' # #' #' # say you want to add the coefficient of variation of the dependent variable #' cv = function(est){ #' y = model.matrix(est, type = "lhs") #' sd(y)/mean(y) #' } #' #' # Now we register the routine #' fitstat_register("cvy", cv, "Coef. of Variation (dep. var.)") #' #' # now we can summon the registered routine with its type ("cvy") #' fitstat(est, "cvy") #' #' # #' # Multi valued tests #' # #' #' # Let's say you want a Wald test with an heteroskedasticiy robust variance #' #' # First we create the function #' hc_wald = function(est){ #' w = wald(est, keep = "!Intercept", print = FALSE, se = "hetero") #' head(w, 4) #' } #' # This test returns a vector of 4 elements: stat, p, df1 and df2 #' #' # Now we register the routine #' fitstat_register("hc_wald", hc_wald, "Wald (HC1)", "test2") #' #' # You can access the statistic, as before #' fitstat(est, "hc_wald") #' #' # But you can also access the sub elements #' fitstat(est, "hc_wald.p") #' fitstat_register = function(type, fun, alias = NULL, subtypes = NULL){ check_arg(type, "character scalar mbt") check_arg(fun, "function mbt") check_arg(alias, "NULL character vector no na") check_arg(subtypes, "NULL character vector no na") # We check the type is not conflicting existing_types = fitstat(give_types = TRUE)$types

opts = getOption("fixest_fitstat_user")

if(type %in% setdiff(existing_types, names(opts))){
stop("The type name '", type, "' is the same as one built-in type. Please choose another one.")
}

# Alias:
# - type_name = alias_name          => only the type
# - subtype_name = alias_name       => only the subtype
# - type.subtype_name = alias_name  => full type name

# if alias is a character vector WITHOUT name, then only the type is named

if(identical(subtypes, "test")){
subtypes = c("stat", "p", "df")
} else if(identical(subtypes, "test2")){
subtypes = c("stat", "p", "df1", "df2")
}

IS_SUB = !is.null(subtypes)

if(missnull(alias)){
alias = type
names(alias) = type

} else {
# Handling all the cases is a bit of a pain

if(is.null(names(alias))){

if(length(alias) == 1){
tmp = alias
names(tmp) = type
alias = tmp

} else if(length(alias) == (1 + length(subtypes))){
# Implicit naming
tmp = alias
names(tmp) = c(type, subtypes)
alias = tmp

} else {
# We have a problem here, we can't infer implicitly, too arbitrary
if(IS_SUB){
check_value(alias, "character scalar", .message = "The alias should be a character scalar.")
} else {
stop("To define the aliases, please use a named character vector, with the names being the codes, and the values the aliases. \n",
"E.g.: alias = c(\"", type, "\" = \"alias_1\", \"", subtypes[1], "\" = \"alias_2\").")
}
}

} else {
alias_names = names(alias)

is_0 = which(nchar(alias_names) == 0)
if(length(is_0) > 0){
# checking the problems
if(length(is_0) > 1){
stop("In the argument 'alias': only the main type can be implicitly set and should come first (they are several implicitly defined atm). Please explicitly use names for each type/subtype you want.")

} else if(is_0 != 1){
stop("In the argument 'alias': only the main type can be implicitly set and should come first (it is currenly not first). Please explicitly use names for each type/subtype you want.")

}

names(alias)[1] = type
alias_names = names(alias)
}

# We check that the names are correct
possible_types = type
if(IS_SUB){
possible_types = c(possible_types, subtypes, paste0(type, ".", subtypes))
}

check_value_plus(alias_names, "multi match", .choices = possible_types, .message = "In argument 'alias', the names should be equal to the type, the subtypes, or of the form 'type.subtype'.")
names(alias) = alias_names
}
}

# We create the full list of aliases
alias_subtypes = NULL
if(IS_SUB){

# subtype aliases => I add some default values if not provided

default_sub_aliases = setNames(subtypes, subtypes)
default_sub_aliases[c("stat", "p")] = c("stat.", "p-value")

# Note that the names are not kept when NA.. ||
alias_subtypes = alias[subtypes] #           ||
qui_NA = is.na(alias_subtypes)   #           ||
alias_subtypes = alias_subtypes[!qui_NA] #   | = > I need to clean them

alias_subtypes[subtypes[qui_NA]] = default_sub_aliases[subtypes[qui_NA]]

# Main aliases

if(type %in% names(alias)){
alias_main = alias[type]
} else  {
alias_main = setNames(type, type)
}

# The full types
full_types = paste0(type, ".", subtypes)
alias_full = alias[full_types]
qui_NA = is.na(alias_full)
alias_full = alias_full[!qui_NA]

alias_full[full_types[qui_NA]] = paste0(alias_main, ", ", alias_subtypes[subtypes[qui_NA]])

alias_main = c(alias_main, alias_full)

} else {
alias_main = alias
}

res = list(fun = fun, alias = alias_main, alias_subtypes = alias_subtypes)

opts[[type]] = res

options(fixest_fitstat_user = opts)

invisible(NULL)
}

#' Computes fit statistics of fixest objects
#'
#' Computes various fit statistics for fixest estimations.
#'
#' @param x A fixest estimation.
#' @param type Character vector or one sided formula. The type of fit statistic to be computed. The classic ones are: n, rmse, r2, pr2, f, wald, ivf, ivwald. You have the full list in the details section or use show_types = TRUE. Further, you can register your own types with [fitstat_register].
#' @param simplify Logical, default is FALSE. By default a list is returned whose names are the selected types. If simplify = TRUE and only one type is selected, then the element is directly returned (ie will not be nested in a list).
#' @param verbose Logical, default is TRUE. If TRUE, an object of class fixest_fitstat is returned (so its associated print method will be triggered). If FALSE a simple list is returned instead.
#' @param show_types Logical, default is FALSE. If TRUE, only prompts all available types.
#' @param frame An environment in which to evaluate variables, default is parent.frame(). Only used if the argument type is a formula and some values in the formula have to be extended with the dot square bracket operator. Mostly for internal use.
#' @param ... Other elements to be passed to other methods and may be used to compute the statistics (for example you can pass on arguments to compute the VCOV when using type = "g" or type = "wald".).
#'
#' @section Registering your own types:
#'
#' You can register custom fit statistics with the function fitstat_register.
#'
#' @section Available types:
#'
#' The types are case sensitive, please use lower case only. The types available are:
#'
#' \describe{
#' \item{n, ll, aic, bic, rmse: }{The number of observations, the log-likelihood, the AIC, the BIC and the root mean squared error, respectively.}
#' \item{my: }{Mean of the dependent variable.}
#' \item{g: }{The degrees of freedom used to compute the t-test (it influences the p-values of the coefficients). When the VCOV is clustered, this value is equal to the minimum cluster size, otherwise, it is equal to the sample size minus the number of variables.}
#' \item{r2, ar2, wr2, awr2, pr2, apr2, wpr2, awpr2: }{All r2 that can be obtained with the function [r2]. The a stands for 'adjusted', the w for 'within' and the p for 'pseudo'. Note that the order of the letters a, w and p does not matter. The pseudo R2s are McFadden's R2s (ratios of log-likelihoods).}
#' \item{theta: }{The over-dispersion parameter in Negative Binomial models. Low values mean high overdispersion.}
#' \item{f, wf: }{The F-tests of nullity of the coefficients. The w stands for 'within'. These types return the following values: stat, p, df1 and df2. If you want to display only one of these, use their name after a dot: e.g. f.stat will give the statistic of the F-test, or wf.p will give the p-values of the F-test on the projected model (i.e. projected onto the fixed-effects).}
#' \item{wald: }{Wald test of joint nullity of the coefficients. This test always excludes the intercept and the fixed-effects. These type returns the following values: stat, p, df1, df2 and vcov. The element vcov reports the way the VCOV matrix was computed since it directly influences this statistic.}
#' \item{ivf, ivf1, ivf2, ivfall: }{These statistics are specific to IV estimations. They report either the IV F-test (namely the Cragg-Donald F statistic in the presence of only one endogenous regressor) of the first stage (ivf or ivf1), of the second stage (ivf2) or of both (ivfall). The F-test of the first stage is commonly named weak instrument test. The value of ivfall is only useful in [etable] when both the 1st and 2nd stages are displayed (it leads to the 1st stage F-test(s) to be displayed on the 1st stage estimation(s), and the 2nd stage one on the 2nd stage estimation -- otherwise, ivf1 would also be displayed on the 2nd stage estimation). These types return the following values: stat, p, df1 and df2.}
#' \item{ivwald, ivwald1, ivwald2, ivwaldall: }{These statistics are specific to IV estimations. They report either the IV Wald-test of the first stage (ivwald or ivwald1), of the second stage (ivwald2) or of both (ivwaldall). The Wald-test of the first stage is commonly named weak instrument test. Note that if the estimation was done with a robust VCOV and there is only one endogenous regressor, this is equivalent to the Kleibergen-Paap statistic. The value of ivwaldall is only useful in [etable] when both the 1st and 2nd stages are displayed (it leads to the 1st stage Wald-test(s) to be displayed on the 1st stage estimation(s), and the 2nd stage one on the 2nd stage estimation -- otherwise, ivwald1 would also be displayed on the 2nd stage estimation). These types return the following values: stat, p, df1, df2, and vcov.}
#' \item{cd: }{The Cragg-Donald test for weak instruments.}
#' \item{kpr: }{The Kleibergen-Paap test for weak instruments.}
#' \item{wh: }{This statistic is specific to IV estimations. Wu-Hausman endogeneity test. H0 is the absence of endogeneity of the instrumented variables. It returns the following values: stat, p, df1, df2.}
#' \item{sargan: }{Sargan test of overidentifying restrictions. H0: the instruments are not correlated with the second stage residuals. It returns the following values: stat, p, df.}
#' \item{lr, wlr: }{Likelihood ratio and within likelihood ratio tests. It returns the following elements: stat, p, df. Concerning the within-LR test, note that, contrary to estimations with femlm or feNmlm, estimations with feglm/fepois need to estimate the model with fixed-effects only which may prove time-consuming (depending on your model). Bottom line, if you really need the within-LR and estimate a Poisson model, use femlm instead of fepois (the former uses direct ML maximization for which the only FEs model is a by product).}
#' }
#'
#'
#'
#'
#' @return
#' By default an object of class fixest_fitstat is returned. Using verbose = FALSE returns a simple a list. Finally, if only one type is selected, simplify = TRUE leads to the selected type to be returned.
#'
#' @examples
#'
#' gravity = feols(log(Euros) ~ log(dist_km) | Destination + Origin, trade)
#'
#' # Extracting the 'working' number of observations used to compute the pvalues
#' fitstat(gravity, "g", simplify = TRUE)
#'
#' # Some fit statistics
#' fitstat(gravity, ~ rmse + r2 + wald + wf)
#'
#' # You can use them in etable
#' etable(gravity, fitstat = ~ rmse + r2 + wald + wf)
#'
#' # For wald and wf, you could show the pvalue instead:
#' etable(gravity, fitstat = ~ rmse + r2 + wald.p + wf.p)
#'
#' # Now let's display some statistics that are not built-in
#' # => we use fitstat_register to create them
#'
#' # We need: a) type name, b) the function to be applied
#' #          c) (optional) an alias
#'
#' fitstat_register("tstand", function(x) tstat(x, se = "stand")[1], "t-stat (regular)")
#' fitstat_register("thc", function(x) tstat(x, se = "heter")[1], "t-stat (HC1)")
#' fitstat_register("t1w", function(x) tstat(x, se = "clus")[1], "t-stat (clustered)")
#' fitstat_register("t2w", function(x) tstat(x, se = "twow")[1], "t-stat (2-way)")
#'
#' # Now we can use these keywords in fitstat:
#' etable(gravity, fitstat = ~ . + tstand + thc + t1w + t2w)
#'
#' # Note that the custom stats we created are can easily lead
#' # to errors, but that's another story!
#'
#'
fitstat = function(x, type, simplify = FALSE, verbose = TRUE, show_types = FALSE,
frame = parent.frame(), ...){

r2_types = c("sq.cor", "cor2", "r2", "ar2", "pr2", "apr2", "par2", "wr2",
"war2", "awr2", "wpr2", "pwr2", "wapr2", "wpar2", "awpr2",
"apwr2", "pawr2", "pwar2")

opts = getOption("fixest_fitstat_user")
user_types = names(opts)

dots = list(...)

if(isTRUE(dots$give_types) || isTRUE(show_types)){ # Compound types => types yielding several values # F-test etc comp_types = c("f", "wf", "ivf", "ivf1", "ivf2", "ivfall", "wald", "ivwald", "ivwald1", "ivwald2", "ivwaldall", "wh", "sargan", "lr", "wlr", "kpr", "cd") full_comp_types = paste(comp_types, rep(c("stat", "p"), each = length(comp_types)), sep = ".") comp_alias = c(f = "F-test", wf = "F-test (projected)", ivfall = "F-test (IV only)", ivf1 = "F-test (1st stage)", ivf2 = "F-test (2nd stage)", wald = "Wald (joint nullity)", ivwaldall = "Wald (IV only)", ivwald1 = "Wald (1st stage)", ivwald2 = "Wald (2nd stage)", wh = "Wu-Hausman", sargan = "Sargan", lr = "LR", wlr = "LR (within)", df1 = "DoF (first)", df2 = "DoF (second)", df = "DoF", kpr = "Kleibergen-Paap", cd = "Cragg-Donald") my_names = paste(names(comp_alias), rep(c("stat", "p"), each = length(comp_alias)), sep = ".") full_comp_alias = setNames(paste0(comp_alias, ", ", rep(c("stat.", "p-value"), each = length(comp_alias))), my_names) # Having some trouble with aggregation post lapply due to names.... # I have a vectorized solution but it's really ugly => loop is clearer user_alias = c() for(i in seq_along(opts)){ user_alias = c(user_alias, opts[[i]]$alias)
}

user_types = names(user_alias)

# "Regular" types
valid_types = c("n", "ll", "aic", "bic", "rmse", "g", "my", "theta", r2_types, comp_types, full_comp_types, user_types)

tex_alias = c(n = "Observations", ll = "Log-Likelihood", aic = "AIC", bic = "BIC", my = "Dependent variable mean", g = "Size of the 'effective' sample", rmse = "RMSE", theta = "Over-dispersion", sq.cor = "Squared Correlation", cor2 = "Squared Correlation", r2="R$^2$", ar2="Adjusted R$^2$", pr2="Pseudo R$^2$", apr2="Adjusted Pseudo R$^2$", wr2="Within R$^2$", war2="Within Adjusted R$^2$", wpr2="Within Pseudo R$^2$", wapr2="Whithin Adjusted Pseudo R$^2$", comp_alias, full_comp_alias, user_alias)

R_alias = c(n = "Observations", ll = "Log-Likelihood", aic = "AIC", bic = "BIC", my = "Dep. Var. mean", g = "G", rmse = "RMSE", theta = "Over-dispersion", sq.cor = "Squared Cor.", cor2 = "Squared Cor.", r2="R2", ar2="Adj. R2", pr2="Pseudo R2", apr2="Adj. Pseudo R2", wr2="Within R2", war2="Within Adj. R2", wpr2="Within Pseudo R2", wapr2="Whithin Adj. Pseudo R2", comp_alias, full_comp_alias, user_alias)

# add r2 type alias
type_alias = c(ivf = "ivf1", ivf.stat = "ivf1.stat", ivf.p = "ivf1.p", ivwald = "ivwald1", ivwald.stat = "ivwald1", ivwald.p = "ivwald1.p", par2 = "apr2", awr2 = "war2", pwr2 = "wpr2", wpar2 = "wapr2", pwar2 = "wapr2", pawr2 = "wapr2", apwr2 = "wapr2", awpr2 = "wapr2", sq.cor = "cor2")

if(show_types){
cat("Available types:\n", paste(valid_types, collapse = ", "), "\n")
return(invisible(NULL))
}

res = list(types = valid_types, tex_alias = tex_alias, R_alias = R_alias, type_alias = type_alias, user_types = user_types)
return(res)
}

check_arg(x, "class(fixest) mbt")
check_arg_plus(type, "character vector no na | os formula")
check_arg(simplify, verbose, "logical scalar")

if("formula" %in% class(type)){
type = .xpd(type, frame = frame)

type = gsub(" ", "", strsplit(deparse_long(type[[2]]), "+", fixed = TRUE)[[1]])
}
type = tolower(type)

# To update
if(!isTRUE(x$summary) || any(c("se", "cluster", "ssc") %in% names(dots))){ x = summary(x, ...) } IS_ETABLE = isTRUE(dots$etable)
set_value = function(vec, value){
if(length(vec) == 1) return(vec)

if(value != ""){

if(!value %in% names(vec)){
stop_up("'", value, "' is not a valid component of the statistic '", root, "'. Only the following are valid: ", enumerate_items(names(vec), "quote"), ".")
}

return(vec[[value]])
} else if(IS_ETABLE){
return(vec[1])
} else {
return(vec)
}
}

res_all = list()
type_all = type

for(i in seq_along(type_all)){
type = type_all[i]

# Big if
if(type == "n"){
res_all[[type]] = x$nobs } else if(type == "ll"){ res_all[[type]] = logLik(x) } else if(type == "aic"){ res_all[[type]] = AIC(x) } else if(type == "bic"){ res_all[[type]] = BIC(x) } else if(type == "rmse"){ res_all[[type]] = if(!is.null(x$ssr)) sqrt(x$ssr / x$nobs) else sqrt(cpp_ssq(resid(x)) / x$nobs) } else if(type == "g"){ my_vcov = x$cov.scaled

G = attr(my_vcov, "G")
if(is.null(G)) G = x$nobs - x$nparams

res_all[[type]] = G

} else if(type == "my"){
res_all[[type]] = mean(model.matrix(x, type = "lhs"))

} else if(type == "theta"){
isNegbin = x$method == "fenegbin" || (x$method_type == "feNmlm" && x$family == "negbin") if(isNegbin){ theta = x$coefficients[".theta"]
names(theta) = "Overdispersion"
} else {
theta = NA
}

res_all[[type]] = theta

} else if(type %in% r2_types){
res_all[[type]] = r2(x, type)

} else {

#
# Types for which root.value is allowed
#

if(grepl(".", type, fixed = TRUE)){
root = gsub("\\..+", "", type)
value = gsub(".+\\.", "", type)
} else {
root = type
value = ""
}

# We need to normalize the types
from_to = c("ivwald" = "ivwald1", "ivf" = "ivf1")
if(root %in% names(from_to)){
type = gsub(root, from_to[root], type, fixed = TRUE)
root = from_to[root]
}

if(root == "f"){
if(!is.null(x$ssr)){ df1 = degrees_freedom(x, "k") - 1 df2 = degrees_freedom(x, "t") if(isTRUE(x$iv) && x$iv_stage == 2){ # We need to compute the SSR w = 1 if(!is.null(x$weights)) w = x$weights ssr = cpp_ssq(x$iv_residuals, w)
} else {
ssr = x$ssr } stat = ((x$ssr_null - ssr) / df1) / (ssr / df2)
p = pf(stat, df1, df2, lower.tail = FALSE)
vec = list(stat = stat, p = p, df1 = df1, df2 = df2)
res_all[[type]] = set_value(vec, value)

} else {
res_all[[type]] = NA
}
} else if(root == "wf"){
df1 = length(x$coefficients) if(!is.null(x$ssr_fe_only) && df1 > 0){
df2 = x$nobs - x$nparams
stat = ((x$ssr_fe_only - x$ssr) / df1) / (x$ssr / df2) p = pf(stat, df1, df2, lower.tail = FALSE) vec = list(stat = stat, p = p, df1 = df1, df2 = df2) res_all[[type]] = set_value(vec, value) } else { res_all[[type]] = NA } } else if(root == "ivfall"){ if(isTRUE(x$iv)){
if(x$iv_stage == 1){ df1 = degrees_freedom(x, vars = x$iv_inst_names_xpd)
df2 = degrees_freedom(x, "resid")

stat = ((x$ssr_no_inst - x$ssr) / df1) / (x$ssr / df2) p = pf(stat, df1, df2, lower.tail = FALSE) vec = list(stat = stat, p = p, df1 = df1, df2 = df2) res_all[[type]] = set_value(vec, value) } else { # f stat for the second stage df1 = degrees_freedom(x, vars = x$iv_endo_names_fit)
df2 = degrees_freedom(x, "resid")

w = 1
if(!is.null(x$weights)) w = x$weights
ssr = cpp_ssq(x$iv_residuals, w) stat = ((x$ssr_no_endo - ssr) / df1) / (ssr / df2)
p = pf(stat, df1, df2, lower.tail = FALSE)
vec = list(stat = stat, p = p, df1 = df1, df2 = df2)
res_all[[type]] = set_value(vec, value)
}

} else {
res_all[[type]] = NA
}

} else if(root == "ivf1"){

if(isTRUE(x$iv)){ df1 = degrees_freedom(x, vars = x$iv_inst_names_xpd, stage = 1)
df2 = degrees_freedom(x, "resid", stage = 1)

if(x$iv_stage == 1){ stat = ((x$ssr_no_inst - x$ssr) / df1) / (x$ssr / df2)
p = pf(stat, df1, df2, lower.tail = FALSE)
vec = list(stat = stat, p = p, df1 = df1, df2 = df2)
res_all[[type]] = set_value(vec, value)

} else {
x_first = x$iv_first_stage for(endo in names(x_first)){ stat = ((x_first[[endo]]$ssr_no_inst - x_first[[endo]]$ssr) / df1) / (x_first[[endo]]$ssr / df2)
p = pf(stat, df1, df2, lower.tail = FALSE)
vec = list(stat = stat, p = p, df1 = df1, df2 = df2)
res_all[[paste0(type, "::", endo)]] = set_value(vec, value)
}
}

} else {
res_all[[type]] = NA
}

} else if(root == "ivf2"){
if(isTRUE(x$iv) && x$iv_stage == 2){
# f stat for the second stage

df1 = degrees_freedom(x, vars = x$iv_endo_names_fit) df2 = degrees_freedom(x, "resid") w = 1 if(!is.null(x$weights)) w = x$weights ssr = cpp_ssq(x$iv_residuals, w)

stat = ((x$ssr_no_endo - ssr) / df1) / (ssr / df2) p = pf(stat, df1, df2, lower.tail = FALSE) vec = list(stat = stat, p = p, df1 = df1, df2 = df2) res_all[[type]] = set_value(vec, value) } else { res_all[[type]] = NA } } else if(root == "kpr"){ if(isTRUE(x$iv) && x$iv_stage == 2){ # The KP rank test is computed in a specific function vec = kp_stat(x) res_all[[type]] = set_value(vec, value) } else { res_all[[type]] = NA } } else if(root == "cd"){ if(isTRUE(x$iv) && x$iv_stage == 2){ # The KP rank test is computed in a specific function vec = cd_stat(x) res_all[[type]] = set_value(vec, value) } else { res_all[[type]] = NA } } else if(root == "wald"){ # Joint nullity of the coefficients # if FE => on the projected model only qui = !names(x$coefficients) %in% "(Intercept)"
my_coef = x$coefficients[qui] df1 = length(my_coef) if(df1 > 0){ df2 = degrees_freedom(x, "resid") # The VCOV is always full rank in here stat = .wald(x, names(my_coef)) p = pf(stat, df1, df2, lower.tail = FALSE) vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type"))
res_all[[type]] = set_value(vec, value)

} else {
# Only fixef
res_all[[type]] = NA
}

} else if(root == "ivwaldall"){

if(isTRUE(x$iv)){ if(x$iv_stage == 1){

df1 = degrees_freedom(x, vars = x$iv_inst_names_xpd) df2 = degrees_freedom(x, "resid") stat = .wald(x, x$iv_inst_names_xpd)

p = pf(stat, df1, df2, lower.tail = FALSE)
vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type")) res_all[[type]] = set_value(vec, value) } else { # wald stat for the second stage df1 = degrees_freedom(x, vars = x$iv_endo_names_fit)
df2 = degrees_freedom(x, "resid")

stat = .wald(x, x$iv_endo_names_fit) p = pf(stat, df1, df2, lower.tail = FALSE) vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type"))
res_all[[type]] = set_value(vec, value)
}

} else {
res_all[[type]] = NA
}

} else if(root %in% "ivwald1"){

if(isTRUE(x$iv)){ df1 = degrees_freedom(x, vars = x$iv_inst_names_xpd, stage = 1)
df2 = degrees_freedom(x, "resid", stage = 1)

if(x$iv_stage == 1){ stat = .wald(x, x$iv_inst_names_xpd)

p = pf(stat, df1, df2, lower.tail = FALSE)
vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type")) res_all[[type]] = set_value(vec, value) } else { x_first = x$iv_first_stage
inst = x$iv_inst_names_xpd for(endo in names(x_first)){ my_x_first = x_first[[endo]] if(is.null(my_x_first$cov.scaled)){
# We compute the VCOV like for the second stage
flags = x$summary_flags if(is.null(flags) || !is.null(dots$se) || !is.null(dots$cluster)){ my_x_first = summary(my_x_first, ...) } else { my_x_first = summary(my_x_first, vcov = flags$vcov, ssc = flags$ssc, ...) } } stat = .wald(my_x_first, inst) p = pf(stat, df1, df2, lower.tail = FALSE) vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type"))
res_all[[paste0(type, "::", endo)]] = set_value(vec, value)
}
}

} else {
res_all[[type]] = NA
}

} else if(root == "ivwald2"){
if(isTRUE(x$iv) && x$iv_stage == 2){
# wald stat for the second stage

df1 = degrees_freedom(x, vars = x$iv_endo_names_fit) df2 = degrees_freedom(x, "resid") stat = .wald(x, x$iv_endo_names_fit)

p = pf(stat, df1, df2, lower.tail = FALSE)
vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = attr(x$cov.scaled, "type")) res_all[[type]] = set_value(vec, value) } else { res_all[[type]] = NA } } else if(root == "wh"){ if(isTRUE(x$iv) && x$iv_stage == 2){ # Wu Hausman stat for the second stage vec = x$iv_wh
res_all[[type]] = set_value(vec, value)

} else {
res_all[[type]] = NA
}

} else if(root == "sargan"){
if(!is.null(x$iv_sargan)){ # Wu Hausman stat for the second stage vec = x$iv_sargan
res_all[[type]] = set_value(vec, value)

} else {
res_all[[type]] = NA
}

} else if(root == "lr"){
if(!is.null(x$ll_null)){ stat = 2 * (x$loglik - x$ll_null) df = x$nparams
p = pchisq(stat, df, lower.tail = FALSE)

vec = list(stat = stat, p = p, df = df)
res_all[[type]] = set_value(vec, value)

} else {
res_all[[type]] = NA
}

} else if(root == "wlr"){
if(!is.null(x$ll_fe_only) || (x$method_type == "feglm" && !is.null(x$fixef_id))){ if(x$method_type == "feglm"){
# estimation of the FE only model

newdata = cbind(data.frame(y = x$y), as.data.frame(x$fixef_id))
if(!is.null(x$fixef_terms)){ newdata = cbind(newdata, as.data.frame(x$slope_variables))
}
new_fml = merge_fml(y ~ 1, x$fml_all$fixef)
res_fe = feglm(fml = new_fml, data = newdata, glm.tol = 1e-2, fixef.tol = 1e-3, family = x$family$family, weights = x$weights, offset = x$offset)

ll_fe_only = logLik(res_fe)
} else {
ll_fe_only = x$ll_fe_only } stat = 2 * (x$loglik - ll_fe_only)
df = length(x$coefficients) p = pchisq(stat, df, lower.tail = FALSE) vec = list(stat = stat, p = p, df = df) res_all[[type]] = set_value(vec, value) } else { res_all[[type]] = NA } } else if(root %in% user_types){ res_all[[type]] = set_value(opts[[root]]$fun(x), value)

} else {
stop("The type '", type, "' is not supported, see details of ?fitstat, or use fitstat(show_types = TRUE) to get the names of all supported tests.")
}
}

}

if(simplify){
verbose = FALSE
}

if(verbose){
class(res_all) = "fixest_fitstat"
}

if(length(type_all) == 1 && simplify){
return(res_all[[1]])
}

res_all
}

#' Wald test of nullity of coefficients
#'
#' Wald test used to test the joint nullity of a set of coefficients.
#'
#' @inheritParams print.fixest
#' @inheritParams summary.fixest
#' @inheritParams etable
#'
#' @param print Logical, default is TRUE. If TRUE, then a verbose description of the test is prompted on the R console. Otherwise only a named vector containing the test statistics is returned.
#' @param ... Any other element to be passed to [summary.fixest].
#'
#' @details
#' The type of VCOV matrix plays a crucial role in this test. Use the arguments se and cluster to change the type of VCOV for the test.
#'
#' @return
#' A named vector containing the following elements is returned: stat, p, df1, and df2. They correspond to the test statistic, the p-value, the first and second degrees of freedoms.
#'
#' If no valid coefficient is found, the value NA is returned.
#'
#' @examples
#'
#' data(airquality)
#'
#' est = feols(Ozone ~ Solar.R + Wind + poly(Temp, 3), airquality)
#'
#' # Testing the joint nullity of the Temp polynomial
#' wald(est, "poly")
#'
#' # Same but with clustered SEs
#' wald(est, "poly", cluster = "Month")
#'
#' # Now: all vars but the polynomial and the intercept
#' wald(est, drop = "Inte|poly")
#'
#' #
#' # Toy example: testing pre-trends
#' #
#'
#' data(base_did)
#'
#' est_did = feols(y ~ x1 + i(period, treat, 5) | id + period, base_did)
#'
#' # The graph of the coefficients
#' coefplot(est_did)
#'
#' # The pre-trend test
#' wald(est_did, "period::[1234]$") #' #' # If "period::[1234]$" looks weird to you, check out
#' # regular expressions: e.g. see ?regex.
#' # Learn it, you won't regret it!
#'
#'
wald = function(x, keep = NULL, drop = NULL, print = TRUE, vcov, se, cluster, ...){
# LATER:
# - keep can be a list
#   * list("fit_" = 1, "x5$") # * regex = restriction. No "=" => 0 check_arg(x, "class(fixest)") check_arg(keep, drop, "NULL character vector no na") if(isTRUE(x$onlyFixef)) return(NA)

dots = list(...)
if(!isTRUE(x$summary)|| !missing(vcov) || !missing(se) || !missing(cluster) || ...length() > 0){ x = summary(x, vcov = vcov, se = se, cluster = cluster, ...) } if(missing(keep) && missing(drop)){ drop = "\$$Intercept\$$" } coef_name = names(x$coefficients)
coef_name = keep_apply(coef_name, keep)
coef_name = drop_apply(coef_name, drop)

if(length(coef_name) == 0) return(NA)

qui = names(x$coefficients) %in% coef_name my_coef = x$coefficients[qui]
df1 = length(my_coef)

df2 = x$nobs - x$nparams

# The VCOV is always full rank in here
stat = drop(my_coef %*% solve(x$cov.scaled[qui, qui]) %*% my_coef) / df1 p = pf(stat, df1, df2, lower.tail = FALSE) vcov = attr(x$cov.scaled, "type")
vec = list(stat = stat, p = p, df1 = df1, df2 = df2, vcov = vcov)

if(print){
cat("Wald test, H0: ", ifsingle(coef_name, "", "joint "), "nullity of ", enumerate_items(coef_name), "\n", sep  ="")
cat(" stat = ", numberFormatNormal(stat),
", p-value ", ifelse(p < 2.2e-16, "< 2.2e-16", paste0("= ", numberFormatNormal(p))),
", on ", numberFormatNormal(df1), " and ", numberFormatNormal(df2), " DoF, ",
"VCOV: ", vcov, ".", sep = "")

return(invisible(vec))
} else {
return(vec)
}
}

fitstat_validate = function(x, vector = FALSE){
check_value(x, "NA | os formula | charin(FALSE) | character vector no na", .arg_name = "fitstat", .up = 1)

if("formula" %in% class(x)){
x = attr(terms(update(~ DEFAULT, x)), "term.labels")
} else if (length(x) == 1 && (isFALSE(x) || is.na(x))){
x = c()
}

# checking the types
fitstat_fun_types = fitstat(give_types = TRUE)
fitstat_type_allowed = fitstat_fun_types$types x = unique(x) type_alias = fitstat_fun_types$type_alias

if(any(x %in% names(type_alias))){
i = intersect(x, names(type_alias))
x[x %in% i] = type_alias[x[x %in% i]]
}

pblm = setdiff(x, c(fitstat_type_allowed, "DEFAULT"))
if(length(pblm) > 0){
stop_up("In fitstat, argument 'x' must be a one sided formula (or a character vector) containing valid types from the function fitstat (see details in ?fitstat or use fitstat(show_types = TRUE)). The type", enumerate_items(pblm, "s.is.quote"), " not valid.")
}

if(length(x) == 0){
x = NA
} else if(vector){
x = gsub("DEFAULT", ".", x, fixed = TRUE)
} else {
x = as.formula(paste("~", paste(gsub("DEFAULT", ".", x), collapse = "+")))
}

x
}

#' R2s of fixest models
#'
#' Reports different R2s for fixest estimations (e.g. [feglm] or [feols]).
#'
#' @param x A fixest object, e.g. obtained with function [feglm] or [feols].
#' @param type A character vector representing the R2 to compute. The R2 codes are of the form: "wapr2" with letters "w" (within), "a" (adjusted) and "p" (pseudo) possibly missing. E.g. to get the regular R2: use type = "r2", the within adjusted R2: use type = "war2", the pseudo R2: use type = "pr2", etc. Use "cor2" for the squared correlation. By default, all R2s are computed.
#' @param full_names Logical scalar, default is FALSE. If TRUE then names of the vector in output will have full names instead of keywords (e.g. Squared Correlation instead of cor2, etc).
#'
#' @details
#'
#' The pseudo R2s are the McFaddens R2s, that is the ratio of log-likelihoods.
#'
#' For R2s with no theoretical justification, like e.g. regular R2s for maximum likelihood models -- or within R2s for models without fixed-effects, NA is returned. The single measure to possibly compare all kinds of models is the squared correlation between the dependent variable and the expected predictor.
#'
#' The pseudo-R2 is also returned in the OLS case, it corresponds to the pseudo-R2 of the equivalent GLM model with a Gaussian family.
#'
#' For the adjusted within-R2s, the adjustment factor is (n - nb_fe) / (n - nb_fe - K) with n the number of observations, nb_fe the number of fixed-effects and K the number of variables.
#'
#' @return
#' Returns a named vector.
#'
#' @author
#' Laurent Berge
#'
#' @examples
#'
#'
#' # We estimate the effect of distance on trade (with 3 fixed-effects)
#' est = feols(log(Euros) ~ log(dist_km)|Origin+Destination+Product, trade)
#'
#' # Squared correlation:
#' r2(est, "cor2")
#'
#' # "regular" r2:
#' r2(est, "r2")
#'
#' # pseudo r2 (equivalent to GLM with Gaussian family)
#' r2(est, "pr2")
#'
#' # adjusted within r2
#' r2(est, "war2")
#'
#' # all four at once
#' r2(est, c("cor2", "r2", "pr2", "war2"))
#'
#' # same with full names instead of codes
#' r2(est, c("cor2", "r2", "pr2", "war2"), full_names = TRUE)
#'
r2 = function(x, type = "all", full_names = FALSE){
# p: pseudo
# w: within

check_arg(full_names, "logical scalar")

if(!"fixest" %in% class(x)){
stop("Only 'fixest' objects are supported.")
}

check_arg(type, "character vector no na", .message = "Argument 'type' must be a character vector (e.g. type = c(\"cor2\", \"r2\", \"pr2\")). (a: adjused, p: pseudo, w: within.)")

# type_allowed next => ("count", "acount") ?
dict_names = c("cor2" = "Squared Correlation", "r2" = "R2", "ar2" = "Adjusted R2", "pr2" = "Pseudo R2", "apr2" = "Adjusted Pseudo R2", "wr2" = "Within R2", "war2" = "Adjusted Within R2", "wpr2" = "Within Pseudo R2", "wapr2" = "Adjusted Within Pseudo R2")
type_allowed = c("cor2", "r2", "ar2", "pr2", "apr2", "wr2", "war2", "wpr2", "wapr2")
types_alias = c(par2 = "apr2", awr2 = "war2", pwr2 = "wpr2", wpar2 = "wapr2", pwar2 = "wapr2", pawr2 = "wapr2", apwr2 = "wapr2", awpr2 = "wapr2", sq.cor = "cor2")
if("all" %in% type){
type_all = type_allowed
} else {
type_all = tolower(type)
pblm = setdiff(type_all, c(type_allowed, names(types_alias)))
if(length(pblm) > 0){
stop("The r2 type", enumerate_items(pblm, "quote.s.is"), " not valid.")
}
}

type_all = dict_apply(type_all, types_alias)

is_ols = x$method_type == "feols" isFixef = "fixef_vars" %in% names(x) n = nobs(x) res = rep(NA, length(type_all)) for(i in seq_along(type_all)){ myType = type_all[i] if(myType == "cor2"){ res[i] = x$sq.cor
next
}

if(!grepl("p", myType) && !is_ols){
# non pseudo R2 not valid for non ols
next
}

if(grepl("w", myType) && !isFixef){
# within R2 not valid for models without FE
next
}

adj = grepl("a", myType)
pseudo = grepl("p", myType)
within = grepl("w", myType) && isFixef
ifNullNA = function(x) ifelse(is.null(x), NA, x)
if(within && isFixef){

if(isTRUE(x$onlyFixef)) next if(is.null(x$ssr_fe_only) && !is.null(x$fixef_vars)){ # This is the case of feglm where there were no fe_only model estimated # => we need to compute the FE model first # 2019-11-26: now self contained call (no need for outer frame evaluation) if(isTRUE(x$lean)){
stop("Within R2s are not available for 'lean' fixest objects. Please reestimate with 'lean = FALSE'.")
}

# constructing the data
newdata = cbind(data.frame(y = x$y), as.data.frame(x$fixef_id))
if(!is.null(x$fixef_terms)){ newdata = cbind(newdata, as.data.frame(x$slope_variables))
}
# Fe/slope only formula
new_fml = merge_fml(y ~ 1, x$fml_all$fixef)

# x$family$family is also normal
res_fe = feglm(fml = new_fml, data = newdata, glm.tol = 1e-2, fixef.tol = 1e-3, family = x$family$family, weights = x$weights, offset = x$offset)

x$ssr_fe_only = cpp_ssq(res_fe$residuals)
x$ll_fe_only = logLik(res_fe) } # within df_k = ifelse(adj, length(coef(x)), 0) if(pseudo){ ll_fe_only = ifNullNA(x$ll_fe_only)
ll = logLik(x)
res[i] = 1 - (ll - df_k) / ll_fe_only
} else {
ssr_fe_only = ifNullNA(x$ssr_fe_only) nb_fe = x$nparams - length(coef(x))
res[i] = 1 - x$ssr / ssr_fe_only * (n - nb_fe) / (n - nb_fe - df_k) } } else { if(x$nparams == 1 && any(grepl("(Intercept)", names(x$coefficients), fixed = TRUE))){ res[i] = NA } else { df_k = ifelse(adj, x$nparams, 1 - pseudo)
if(pseudo){
ll_null = x$ll_null ll = logLik(x) if(adj){ res[i] = 1 - (ll - x$nparams + 1) / ll_null
} else {
res[i] = 1 - ll / ll_null
}
} else {
ssr_null = x$ssr_null df.intercept = 1 * (isFixef || any(grepl("(Intercept)", names(x$coefficients), fixed = TRUE)))
res[i] = 1 - x$ssr / ssr_null * (n - df.intercept) / (n - df_k) } } } } if(full_names){ names(res) = dict_apply(type_all, dict_names) } else { names(res) = type_all } res } #' Gets the degrees of freedom of a fixest estimation #' #' Simple utility to extract the degrees of freedom from a fixest estimation. #' #' @inheritParams vcov.fixest #' #' @param x A fixest estimation. #' @param type Character scalar, equal to "k", "resid", "t". If "k", then the number of regressors is returned. If "resid", then it is the "residuals degree of freedom", i.e. the number of observations minus the number of regressors. If "t", it is the degrees of freedom used in the t-test. Note that these values are affected by how the VCOV of x is computed, in particular when the VCOV is clustered. #' @param vars A vector of variable names, of the regressors. This is optional. If provided, then type is set to 1 by default and the number of regressors contained in vars is returned. This is only useful in the presence of collinearity and we want a subset of the regressors only. (Mostly for internal use.) #' @param stage Either 1 or 2. Only concerns IV regressions, which stage to look at. #' #' The type of VCOV can have an influence on the degrees of freedom. In particular, when the VCOV is clustered, the DoF returned will be in accordance with the way the small sample correction was performed when computing the VCOV. That type of value is in general not what we have in mind when we think of "degrees of freedom". To obtain the ones that are more intuitive, please use degrees_freedom_iid instead. #' #' #' @examples #' #' # First: an estimation #' #' base = iris #' names(base) = c("y", "x1", "x2", "x3", "species") #' est = feols(y ~ x1 + x2 | species, base) #' #' # "Normal" standard-errors (SE) #' est_standard = summary(est, se = "st") #' #' # Clustered SEs #' est_clustered = summary(est, se = "clu") #' #' # The different degrees of freedom #' #' # => different type 1 DoF (because of the clustering) #' degrees_freedom(est_standard, type = "k") #' degrees_freedom(est_clustered, type = "k") # fixed-effects are excluded #' #' # => different type 2 DoF (because of the clustering) #' degrees_freedom(est_standard, type = "resid") # => equivalent to the df.residual from lm #' degrees_freedom(est_clustered, type = "resid") #' #' #' degrees_freedom = function(x, type, vars = NULL, vcov = NULL, se = NULL, cluster = NULL, ssc = NULL, stage = 2){ check_arg(x, "class(fixest) mbt") check_arg_plus(type, "match(k, resid, t)") check_arg(stage, "integer scalar GE{1} LE{2}") check_arg(vars, "character vector no na") if(stage == 1 && isTRUE(x$iv) && x$iv_stage == 2){ x = x$iv_first_stage[[1]]
}

if(!missnull(vars)){
if(!missing(type) && type != "k"){
warning("The argument 'type' is ignored when the argument 'vars' is present. Type 'k' is returned.")
}

vars_keep = intersect(vars, names(x$coefficients)) return(length(vars_keep)) } if(missing(type)){ stop("The argument 'type' is required but is currently missing.") } if(!isTRUE(x$summary) || !missnull(vcov) || !missnull(se) || !missnull(cluster) || !missnull(ssc)){
x = summary(x, vcov = vcov, se = se, cluster = cluster, ssc = ssc)
}

vcov = x$cov.scaled if(is.null(vcov)){ dof.K = x$nparams
df.t = nobs(x) - dof.K
} else {
# From v0.10.0 onward, "dt.t" is always provided!
df.t = attr(vcov, "df.t")
dof.K = attr(vcov, "dof.K")
}

res = switch(type,
"k" = dof.K,
"resid" = max(nobs(x) - dof.K, 1),
"t" = df.t)

res
}

#' @describeIn degrees_freedom Gets the degrees of freedom of a fixest estimation
degrees_freedom_iid = function(x, type){
degrees_freedom(x, type, vcov = "iid")
}

####
#### Stats -- internal ####
####

.wald = function(x, var){
# x: fixest estimation

coef = x$coefficients vcov = x$cov.scaled
if(is.null(vcov)){
# => INTERNAL ERROR
stop("INTERNAL ERROR: .wald should be applied only to objects with a VCOV already computed. Could you report the error to the maintainer of fixest?")
}

var_keep = intersect(var, names(coef))

if(length(var_keep) == 0){
# All vars removed bc of collin => stat = 0
return(0)
}

# To handle errors (rare but can happen)
if(any(diag(vcov) < 1e-10)){
vcov = mat_posdef_fix(vcov)
}

chol_decomp = cpp_cholesky(vcov[var_keep, var_keep, drop = FALSE])
vcov_inv = chol_decomp$XtX_inv if(isTRUE(chol_decomp$all_removed)){
# Can happen for indicators + clustering at the same level
return(1e6)
}

if(any(chol_decomp$id_excl)){ var_keep = var_keep[!chol_decomp$id_excl]
}

my_coef = coef[var_keep]

stat = drop(my_coef %*% vcov_inv %*% my_coef) / length(my_coef)

stat
}

kp_stat = function(x){
# internal function => x must be a fixest object
#
# The code here is a translation of the ranktest.jl function from the Vcov.jl package
# from @matthieugomez (see https://github.com/matthieugomez/Vcov.jl)
#

if(!isTRUE(x$iv) || !x$iv_stage == 2) return(NA)

# Necessary data

X_proj = as.matrix(resid(summary(x, stage = 1)))

# while the projection of X has already been computed, we need to do it
# for Z => that's a pain in the neck
# computational cost is much lower is I include the KP computation at
# estimation time.... since here we do the job twice... we'll see

Z = model.matrix(x, type = "iv.inst")
Z_proj = proj_on_U(x, Z)

k = n_endo = ncol(X_proj)
l = n_inst = ncol(Z)

# We assume l >= k
q = min(k, l) - 1

# Let's go

if(n_endo == 1){
PI = t(coef(summary(x, stage = 1)))
} else {
PI = coef(summary(x, stage = 1))
}
PI = PI[, colnames(PI) %in% x$iv_inst_names_xpd, drop = FALSE] Fmat = chol(crossprod(Z_proj)) Gmat = chol(crossprod(X_proj)) theta = Fmat %*% t(solve(t(Gmat)) %*% PI) # theta: n_inst x n_endo if(n_inst == n_endo){ svd_decomp = svd(theta) u = svd_decomp$u
vt = t(svd_decomp$v) } else { # we need full decomp => un optimized decomp svd_decomp = mat_svd(theta) u = svd_decomp$u
vt = svd_decomp$vt } u_sub = u[k:l, k:l, drop = FALSE] vt_sub = vt[k, k] vt_k = vt[1:k, k, drop = FALSE] ssign = function(x) if(x == 0) 1 else sign(x) # There may be more sign problems here if(k == l){ a_qq = ssign(u_sub[1]) * u[1:l, k:l, drop = FALSE] b_qq = ssign(vt_sub[1]) * t(vt_k) } else { a_qq = u[1:l, k:l] %*% (solve(u_sub) %*% mat_sqrt(u_sub %*% t(u_sub))) b_qq = mat_sqrt(vt_sub %*% t(vt_sub)) %*% (solve(t(vt_sub)) %*% t(vt_k)) } # kronecker kronv = kronecker(b_qq, t(a_qq)) lambda = kronv %*% c(theta) # There is need to compute the vcov specifically for this case # We do it the same way as it was for x VCOV_TYPE = attr(x$cov.scaled, "type")

if(identical(VCOV_TYPE, "IID")){
vlab = chol(tcrossprod(kronv) / nrow(X_proj))

} else {
K = t(kronecker(Gmat, Fmat))

my_scores = do.call(cbind, lapply(1:ncol(X_proj), function(i) Z_proj * X_proj[, i]))

x_new = x
x_new$scores = my_scores vcov = x$summary_flags$vcov ssc = x$summary_flags$ssc meat = vcov(x_new, vcov = vcov, ssc = ssc, sandwich = FALSE) vhat = solve(K, t(solve(K, meat))) # DOF correction now n = nobs(x) - identical(VCOV_TYPE, "cluster") df_resid = degrees_freedom(x, "resid", stage = 1) vhat = vhat * n / df_resid vlab = kronv %*% vhat %*% t(kronv) } r_kp = t(lambda) %*% solve(vlab, lambda) # cat("KP r:\n") ; print(r_kp) # Now the results kp_df = n_inst - n_endo + 1 kp_f = r_kp / n_inst kp_p = pchisq(kp_f, kp_df, lower.tail = FALSE) list(stat = kp_f, p = kp_p, df = kp_df) } cd_stat = function(x){ # internal function # x: fixest object # if(!isTRUE(x$iv) || !x$iv_stage == 2) return(NA) # Necessary data X = model.matrix(x, type = "iv.endo") X_proj = proj_on_U(x, X) Z = model.matrix(x, type = "iv.inst") Z_proj = proj_on_U(x, Z) n = nobs(x) n_endo = NCOL(X_proj) n_inst = NCOL(Z_proj) # The stat if(FALSE){ # => just use the canonical correlation, it's faster df_resid = degrees_freedom(x, resid, stage = 1) V = resid(summary(x, stage = 1)) P_Z_proj = Z_proj %*% solve(crossprod(Z_proj)) %*% t(Z_proj) SIGMA_vv_hat = (t(X) %*% V) / df_resid inv_sqrt_SIGMA_vv_hat = solve(mat_sqrt(SIGMA_vv_hat)) G = (t(inv_sqrt_SIGMA_vv_hat) %*% t(X_proj) %*% P_Z_proj %*% X_proj %*% inv_sqrt_SIGMA_vv_hat) / n_inst cd = min(eigen(G)$values)

} else {
cc = min(cancor(X_proj, Z_proj)$cor) cd = ((n - n_endo - n_inst - 1) / n_inst) / ((1 - cc^2) / cc^2) } cd } #### #### Misc internal funs #### #### mat_sqrt = function(A){ e = eigen(A) # e$vectors %*% diag(x = sqrt(e$values), nrow = nrow(A)) %*% t(e$vectors)
ev = pmax(e$values, 1e-10) M = e$vectors %*% diag(x = ev**.25, nrow = nrow(A))
tcrossprod(M)
}

mat_svd = function(A){
# From https://rpubs.com/aaronsc32/singular-value-decomposition-r
# https://math.stackexchange.com/questions/2359992/how-to-resolve-the-sign-issue-in-a-svd-problem

ATA = crossprod(A)
ATA.e = eigen(ATA)
v = ATA.e$vectors AAT = tcrossprod(A) AAT.e = eigen(AAT) u = AAT.e$vectors
r = sqrt(ATA.e$values) # we want the last diag element of vt to be positive vt = t(v) k = nrow(vt) if(nrow(u) == k && vt[k, k] < 0){ vt[k, ] = - vt[k, ] u[, k] = - u[, k] } list(u = u, vt = vt, d = r) } proj_on_U = function(x, Z){ # Projects some variables (either the endo or the inst) on the # exogenous variables (U) # # x: a fixest IV estimation # # I write Z in the arguments, but it can be any matrix (like X) # U = model.matrix(x, type = "iv.exo") is_U = !is.null(U) if(!is.null(x$fixef_vars)){
# we need to center the variables
# Of course, if there are varying slopes, that's a pain in the neck

if(!is.null(x$slope_flag)){ Z_dm = demean(Z, x$fixef_id[order(x$fixef_sizes, decreasing = TRUE)], slope.vars = x$slope_variables_reordered,
slope.flag = x$slope_flag_reordered) if(is_U){ U_dm = demean(U, x$fixef_id[order(x$fixef_sizes, decreasing = TRUE)], slope.vars = x$slope_variables_reordered,
slope.flag = x$slope_flag_reordered) } } else { Z_dm = demean(Z, x$fixef_id)
if(is_U){
U_dm = demean(U, x\$fixef_id)
}
}

if(is_U){
Z_proj = resid(feols.fit(Z_dm, U_dm))
} else {
Z_proj = Z_dm
}

} else if(is_U){
Z_proj = resid(feols.fit(Z, U))
} else {
Z_proj = Z
}

Z_proj
}


## Try the fixest package in your browser

Any scripts or data that you put into this service are public.

fixest documentation built on Nov. 24, 2023, 5:11 p.m.