R/VCOV.R

Defines functions getFixest_vcov setFixest_vcov getFixest_ssc setFixest_ssc is_function_in_it cutoff_deduce gen_vcov_aliases check_set_cutoff prepare_sandwich slide_args catch_fun_args oldargs_to_vcov vcov_conley_internal vcov_driscoll_kraay_internal vcov_newey_west_internal vcov_cluster_internal vcov_hetero_internal vcov_iid_internal vcov_setup vcov_conley vcov_NW vcov_DK vcov_cluster dof ssc vcov.fixest

Documented in dof getFixest_ssc getFixest_vcov setFixest_ssc setFixest_vcov ssc vcov_cluster vcov_conley vcov_DK vcov.fixest vcov_NW

#----------------------------------------------#
# Author: Laurent Berge
# Date creation: Thu Apr 01 10:25:53 2021
# ~: VCOV related stuff
#----------------------------------------------#


####
#### User-level ####
####



#' Computes the variance/covariance of a `fixest` object
#'
#' This function extracts the variance-covariance of estimated parameters from a model 
#' estimated with [`femlm`], [`feols`] or [`feglm`].
#'
#' @inheritParams summary.fixest
#' @inheritParams nobs.fixest
#'
#' @param attr Logical, defaults to `FALSE`. Whether to include the attributes describing how 
#' the VCOV was computed.
#' @param ... Other arguments to be passed to [`summary.fixest`].
#'
#' The computation of the VCOV matrix is first done in [`summary.fixest`].
#'
#' @details
#' For an explanation on how the standard-errors are computed and what is the exact meaning of 
#' the arguments, please have a look at the dedicated vignette: 
#' [On standard-errors](https://lrberge.github.io/fixest/articles/standard_errors.html).
#'
#' @seealso
#' You can also compute VCOVs with the following functions: [`vcov_cluster`], 
#' [`vcov_hac`], [`vcov_conley`].
#'
#' @return
#' It returns a \eqn{K\times K} square matrix where \eqn{K} is the number of variables 
#' of the fitted model.
#' If `attr = TRUE`, this matrix has an attribute \dQuote{type} specifying how this 
#' variance/covariance matrix has been computed.
#'
#' @author
#' Laurent Berge
#'
#' @seealso
#' See also the main estimation functions [`femlm`], [`feols`] or [`feglm`]. 
#' [`summary.fixest`], [`confint.fixest`], [`resid.fixest`], [`predict.fixest`], [`fixef.fixest`].
#' 
#' @references
#'
#' Ding, Peng, 2021, "The Frisch–Waugh–Lovell theorem for standard errors." Statistics & Probability Letters 168. 
#'
#' @examples
#'
#' # Load panel data
#' data(base_did)
#'
#' # Simple estimation on a panel
#' est = feols(y ~ x1, base_did)
#'
#' # ======== #
#' # IID VCOV #
#' # ======== #
#'
#' # By default the VCOV assumes iid errors:
#' se(vcov(est))
#'
#' # You can make the call for an iid VCOV explicitly:
#' se(vcov(est, "iid"))
#'
#' #
#' # Heteroskedasticity-robust VCOV
#' #
#'
#' # By default the VCOV assumes iid errors:
#' se(vcov(est, "hetero"))
#'
#' # => note that it also accepts vcov = "White" and vcov = "HC1" as aliases.
#'
#' # =============== #
#' # Clustered VCOVs #
#' # =============== #
#'
#' # To cluster the VCOV, you can use a formula of the form cluster ~ var1 + var2 etc
#' # Let's cluster by the panel ID:
#' se(vcov(est, cluster ~ id))
#'
#' # Alternative ways:
#'
#' # -> cluster is implicitly assumed when a one-sided formula is provided
#' se(vcov(est, ~ id))
#'
#' # -> using the argument cluster instead of vcov
#' se(vcov(est, cluster = ~ id))
#'
#' # For two-/three- way clustering, just add more variables:
#' se(vcov(est, ~ id + period))
#'
#' # -------------------|
#' # Implicit deduction |
#' # -------------------|
#' # When the estimation contains FEs, the dimension on which to cluster
#' # is directly inferred from the FEs used in the estimation, so you don't need
#' # to explicitly add them.
#'
#' est_fe = feols(y ~ x1 | id + period, base_did)
#'
#' # Clustered along "id"
#' se(vcov(est_fe, "cluster"))
#'
#' # Clustered along "id" and "period"
#' se(vcov(est_fe, "twoway"))
#'
#'
#' # =========== #
#' # Panel VCOVs #
#' # =========== #
#'
#' # ---------------------|
#' # Newey West (NW) VCOV |
#' # ---------------------|
#' # To obtain NW VCOVs, use a formula of the form NW ~ id + period
#' se(vcov(est, NW ~ id + period))
#'
#' # If you want to change the lag:
#' se(vcov(est, NW(3) ~ id + period))
#'
#' # Alternative way:
#'
#' # -> using the vcov_NW function
#' se(vcov(est, vcov_NW(unit = "id", time = "period", lag = 3)))
#'
#' # -------------------------|
#' # Driscoll-Kraay (DK) VCOV |
#' # -------------------------|
#' # To obtain DK VCOVs, use a formula of the form DK ~ period
#'
#' se(vcov(est, DK ~ period))
#'
#' # If you want to change the lag:
#' se(vcov(est, DK(3) ~ period))
#'
#' # Alternative way:
#'
#' # -> using the vcov_DK function
#' se(vcov(est, vcov_DK(time = "period", lag = 3)))
#'
#' # -------------------|
#' # Implicit deduction |
#' # -------------------|
#' # When the estimation contains a panel identifier, you don't need
#' # to re-write them later on
#'
#' est_panel = feols(y ~ x1, base_did, panel.id = ~id + period)
#'
#' # Both methods, NM and DK, now work automatically
#' se(vcov(est_panel, "NW"))
#' se(vcov(est_panel, "DK"))
#'
#'
#' # =================================== #
#' # VCOVs robust to spatial correlation #
#' # =================================== #
#'
#' data(quakes)
#' est_geo = feols(depth ~ mag, quakes)
#'
#' # ------------|
#' # Conley VCOV |
#' # ------------|
#' # To obtain a Conley VCOV, use a formula of the form conley(cutoff) ~ lat + lon
#' # with lat/lon the latitude/longitude variable names in the data set
#' se(vcov(est_geo, conley(100) ~ lat + long))
#'
#' # Alternative way:
#'
#' # -> using the vcov_DK function
#' se(vcov(est_geo, vcov_conley(lat = "lat", lon = "long", cutoff = 100)))
#'
#' # -------------------|
#' # Implicit deduction |
#' # -------------------|
#' # By default the latitude and longitude are directly fetched in the data based
#' # on pattern matching. So you don't have to specify them.
#' # Furhter, an automatic cutoff is deduced by default.
#'
#' # The following works:
#' se(vcov(est_geo, "conley"))
#'
#'
#' # ======================== #
#' # Small Sample Corrections #
#' # ======================== #
#'
#' # You can change the way the small sample corrections are done with the argument ssc.
#' # The argument ssc must be created by the ssc function
#' se(vcov(est, ssc = ssc(adj = FALSE)))
#'
#' # You can add directly the call to ssc in the vcov formula.
#' # You need to add it like a variable:
#' se(vcov(est, iid ~ ssc(adj = FALSE)))
#' se(vcov(est, DK ~ period + ssc(adj = FALSE)))
#'
#'
#'
vcov.fixest = function(object, vcov = NULL, se = NULL, cluster, ssc = NULL, attr = FALSE, 
                       forceCovariance = FALSE, keepBounded = FALSE, 
                       nthreads = getFixest_nthreads(), vcov_fix = TRUE, ...){
  # computes the clustered vcov

  check_arg(attr, "logical scalar")
  is_attr = attr

  dots = list(...)

  # START :: SECTION used only internally in fixest_env
  only_varnames = isTRUE(dots$only_varnames)
  data_names = dots$data_names
  if(only_varnames){
    # Used internally in fixest_env to find out which variable to keep
    # => we need panel.id, so we can remove the NAs from it if it is implicitly deduced to be used
    # => idem for fixef_vars
    object = list(panel.id = dots$panel.id, fixef_vars = dots$fixef_vars)
  }
  #   END


  if(isTRUE(object$NA_model)){
    # means that the estimation is done without any valid variable
    return(object$cov.scaled)
  }

  if("dof" %in% names(dots)){
    if(is.null(getOption("fixest_warn_dof_arg"))){
      warning("The argument 'dof' is deprecated. Please use 'ssc' instead.")
      options(fixest_warn_dof_arg = TRUE)
    }
    ssc = dots$dof
  }

  if(is_user_level_call()){
    if(!is_function_in_it(vcov)){
      validate_dots(suggest_args = c("vcov", "ssc"), valid_args = "dof")
    }
  }

  # All the available VCOVs
  all_vcov = getOption("fixest_vcov_builtin")
  all_vcov_names = unlist(lapply(all_vcov, `[[`, "name"))
  all_vcov_names = all_vcov_names[nchar(all_vcov_names) > 0]

  # We transform se and cluster into vcov
  vcov_vars = var_values_all = var_names_all = NULL
  vcov = oldargs_to_vcov(se, cluster, vcov)

  sandwich = !isFALSE(dots$sandwich) # I know it's weird

  if(!is.null(object$onlyFixef)){
    # means that the estimation is done without variables
    return(NULL)
  }

  # If it's a summary => we give the vcov directly without further computation! except if arguments are provided which would mean that the user wants the new vcov
  if(isTRUE(object$summary) && missnull(vcov) && missnull(ssc)){
    vcov = object$cov.scaled
    if(!is_attr) {
      all_attr = names(attributes(vcov))
      for(v in setdiff(all_attr, c("dim", "dimnames"))){
        attr(vcov, v) = NULL
      }
    }
    return(vcov)
  }
  
  # we inherit the vcov/ssc from the summary
  assign_flags(object$summary_flags, vcov = vcov, ssc = ssc)

  # Default behavior vcov:
  suffix = ""
  if(missnull(vcov)){

    vcov_default = getFixest_vcov()

    if(!is.null(object$panel.id)){
      # Panel has precedence over FEs
      vcov = vcov_default$panel

    } else {
      n_fe = length(object$fixef_id)

      if(n_fe == 0){
        vcov = vcov_default$no_FE
      } else if(n_fe == 1){
        vcov = vcov_default$one_FE
      } else {
        vcov = vcov_default$two_FE
      }

      if(!is.null(object$fixef_sizes) && object$fixef_sizes[1] == 1){
        # Special case => cleaner output
        vcov = vcov_default$no_FE
      }
    }

  }


  if(isTRUE(object$lean)){
    # we can't compute the SE because scores are gone!
    # LATER: recompute the scores (costly but maybe only possibility for user?)
    #
    # so only IID VCOV is valid

    ok = FALSE
    if(is.character(vcov) && length(vcov) == 1){
      vcov_id = which(sapply(all_vcov, function(x) vcov %in% x$name))

      if(length(vcov_id) == 1){
        vcov_select = all_vcov[[vcov_id]]
        ok = identical(vcov_select$vcov_label, "IID")
      }
    }

    if(!ok) stop("VCOV of 'lean' fixest objects cannot be computed. Please re-estimate with 'lean = FALSE'.")
  }

  ####
  #### ... vcov parsing ####
  ####

  # Checking the value of vcov
  check_set_arg(vcov, "match | formula | function | matrix | list len(1) | class(fixest_vcov_request)", .choices = all_vcov_names)

  user_vcov_name = NULL
  if(is.list(vcov) && !inherits(vcov, "fixest_vcov_request")){
    # We already ensured it was a list of length 1
    user_vcov_name = names(vcov)
    vcov = vcov[[1]]
  }

  if(is.function(vcov)){

    if(only_varnames){
      return(character(0))
    }

    # cleaning dots
    for(var in c("summary_flags")){
      dots[[var]] = NULL
    }

    # The name
    arg_list = dots
    if(".vcov_args" %in% names(dots)){
      # internal call
      vcov_name = dots$vcov_name
      arg_list = dots$.vcov_args
    } else {
      vcov_name = attr(vcov, "deparsed_arg")
      if(is.null(vcov_name)){
        vcov_name = fetch_arg_deparse("vcov")
      } else {
        # cleaning
        attr(vcov, "deparsed_arg") = NULL
      }

      # Getting the right arguments
      arg_list = catch_fun_args(vcov, arg_list, exclude_args = "vcov", keep_dots = TRUE)
    }

    if(!is.null(user_vcov_name)){
      vcov_name = user_vcov_name

    } else {
      vcov_name = gsub("sandwich::", "", vcov_name, fixed = TRUE)
      vcov_name = gsub("^function\\([^\\)]+\\) ", "", vcov_name)
      vcov_name = gsub("^vcov$", "Custom", vcov_name)
    }

    # We shouldn't have a prior on the name of the first argument
    arg_names = formalArgs(vcov)
    arg_list[[arg_names[1]]] = object

    vcov = do.call(vcov, arg_list)

    n_coef = length(object$coefficients)
    check_value(vcov, "square numeric matrix nrow(value)", .value = n_coef,
          .message = paste0("If argument 'vcov' is to be a function, it should return a square numeric matrix of the same dimension as the number of coefficients (here ", n_coef, ")."))

    # We add the type of the matrix
    attr(vcov, "type") = vcov_name
    attr(vcov, "dof.K") = object$nparams

    return(vcov)
  }

  if(is.matrix(vcov)){

    if(only_varnames){
      stop("A custom VCOV matrix cannot be used directly in a fixest estimation.")
    }

    # Check that this makes sense
    n_coef = length(object$coefficients)
    check_value(vcov, "square matrix nrow(value)", .value = n_coef)

    attr(vcov, "type") = if(is.null(user_vcov_name)) "Custom" else user_vcov_name

    attr(vcov, "dof.K") = object$nparams

    return(vcov)
  }

  extra_args = NULL
  if(inherits(vcov, "fixest_vcov_request")){
    if(!is.null(vcov$ssc)) ssc = vcov$ssc
    var_names_all = vcov$var_names_all
    var_values_all = vcov$vcov_vars
    extra_args = vcov$extra_args
    vcov = vcov$vcov

  }

  if(inherits(vcov, "formula")){

    vcov_fml = vcov

    if(length(vcov_fml) == 2){
      vcov = ""
      vcov_vars = fml2varnames(vcov_fml, combine_fun = TRUE)
    } else {
      vcov = deparse_long(vcov_fml[[2]])

      is_extra = grepl("(", vcov, fixed = TRUE)
      if(is_extra){
        vcov = trimws(gsub("\\(.+", "", vcov))
      }

      check_set_arg(vcov, "match", 
                    .choices = all_vcov_names,
                    .message = "If a formula, the arg. 'vcov' must be of the form 'vcov_type ~ vars'. The vcov_type must be a supported VCOV type.")

      if(is_extra){
        new_req = eval(vcov_fml[[2]], environment(vcov_fml))
        extra_args = new_req$extra_args
      }

      vcov_vars = fml2varnames(vcov_fml[c(1, 3)], combine_fun = TRUE)
    }

    qui_ssc = grepl("ssc(", vcov_vars, fixed = TRUE)
    if(any(qui_ssc)){
      ssc_txt = vcov_vars[qui_ssc]
      ssc = eval(str2lang(ssc_txt))
      vcov_vars = vcov_vars[!qui_ssc]
    }

  }

  # Here vcov **must** be a character scalar

  vcov_id = which(sapply(all_vcov, function(x) vcov %in% x$name))

  if(length(vcov_id) != 1){
    stop("Unexpected problem in the selection of the VCOV. This is an internal error. Could you report?")
  }

  vcov_select = all_vcov[[vcov_id]]

  if(length(vcov_select$vars) == 0){
    var_names_all = character(0)

    if(only_varnames){
      return(var_names_all)
    }
  } else {

    if(!is.null(var_values_all)){
      # If we're here, this means that vcov_vars
      # has been provided via a vcov_request:
      #   - either via retro-compatibility (use of "cluster" with a vector or list)
      #   - either via feeding vectors into the user level version of the VCOV requested
      #
      # => so we don't have to do anything, checking will come later and is common with
      # the else{} of this condition

      is_int_all = list()

      if(only_varnames){
        return(character(0))
      }

      # We trim the observations if needed
      for(i in seq_along(var_values_all)){
        value = var_values_all[[i]]
        if(length(value) != object$nobs_origin){
          stopi("To compute the {vcov_select$vcov_label} VCOV, you need to provide variables with the same number of observations as in the original data set. Currently there are {len ? value} instead of {n ? object$nobs_origin}.")
        }

        var_values_all[[i]] = trim_obs_removed(value, object)
      }


    } else {

      patterns_split = strsplit(vcov_select$patterns, " ?\\+ ?")
      n_patterns = lengths(patterns_split)

      if(isTRUE(object$is_fit)){
        # No automatic deduction for fit methods,except for clustered SEs
        n_FE = length(object$fixef_vars)
        if(vcov_select$vcov_label %in% "Clustered"){
          if(n_FE == 0){
            stop("To compute a clustered VCOV from a '.fit' estimation, you need to provide the variables over which to cluster with the function vcov_cluster(). E.g. vcov = vcov_cluster(cluster = df) with df a data.frame containing the clusters. Or make a regular estimation, i.e. not from '.fit'.")
          }
          if(!n_FE %in% n_patterns){
            stop("To compute a clustered VCOV from a '.fit' estimation, you need to provide the variables over which to cluster with the function vcov = vcov_cluster(cluster = df), with df a data.frmae containing the clusters. Or make a regular estimation, i.e. not from '.fit'.")
          }
        } else {
          stop("The estimations from '.fit' methods don't support ", vcov_select$vcov_label, " VCOVs. To use them, perform a regular estimation instead.")
        }
      }

      if(only_varnames == FALSE){
        data = fetch_data(object)
        data_names = names(data)
      }


      if(!length(vcov_vars) %in% n_patterns){
        fml_display = paste0("~ ", vcov_select$patterns[n_patterns != 0])
        stopi("In the argument 'vcov', the number of variables in the RHS of the formula ({len ? vcov_vars}) is not valid. The formula should correspond to{&n_patterns>1; one of} {enum.bq.or ? fml_display}.")
      }


      var_values_all = list()
      # => list of variables used to compute the VCOV
      var_names_all = c()
      # => same as var_values_all but the variable names

      is_int_all = list()
      # => flag to avoid reapplying to_integer

      pattern = patterns_split[[which(n_patterns == length(vcov_vars))]]

      # We find all the variable names and then evaluate them
      for(i in seq_along(vcov_select$vars)){
        vcov_var_name = names(vcov_select$vars)[i]
        vcov_var_value = vcov_select$vars[[i]]

        if(vcov_var_name %in% pattern){
          # => provided by the user, we find which one it corresponds to
          # based on the pattern

          vname = vcov_vars[which(pattern == vcov_var_name)]

          vname_all = all.vars(str2expression(vname))
          if(!all(vname_all %in% data_names)){
            pblm = setdiff(vname_all, data_names)
            stopi("The variable{$s, enum.q ? pblm}, used to compute the VCOV, {$are} not in the original data set. Only variables in the data set can be used.")
          }

          var_names_all[vcov_var_name] = rename_hat(vname)
          if(only_varnames) {
            var_names_all[vcov_var_name] = vname_all[1]
            # To handle combined clusters
            var_names_all = c(var_names_all, vname_all[-1])
            next
          }

          value = eval(str2expression(vname), data)
          var_values_all[[vcov_var_name]] = trim_obs_removed(value, object)

        } else {

          ####
          #### ... --- guessing the variables ####
          ####

          # not provided by the user: GUESSED!
          # 3 types of guesses:
          # - fixef (fixed-effects used in the estimation)
          # - panel.id (panel identifiers used in the estimation)
          # - regex (based on variable names)
          #


          guesses = vcov_var_value$guess_from
          # err_msg, vector of elements of the form c("expect" = "pblm")
          err_msg = c()
          for(k in seq_along(guesses)){
            type = names(guesses)[k]

            if(type == "fixef"){

              if(is.null(object$fixef_vars)){
                msg = setNames(nm = "the fixed-effects",
                               "there was no fixed-effect in the estimation")
                err_msg = c(err_msg, msg)
                next
              }

              if(length(object$fixef_vars) < guesses$fixef){
                msg = setNames(nm = sma("the {nth ? guesses$fixef} fixed-effect"),
                               sma("the estimation had only {Len ? object$fixef_vars} ",
                                   "fixed-effect{$s}"))
                err_msg = c(err_msg, msg)
                next
              }

              var_names_all[vcov_var_name] = object$fixef_vars[guesses$fixef]
              if(only_varnames) break

              is_int_all[[vcov_var_name]] = TRUE

              var_values_all[[vcov_var_name]] = object$fixef_id[[guesses$fixef]]
              break

            } else if(type == "panel.id"){

              if(is.null(object$panel.id)){
                msg = setNames(nm = "the 'panel.id' identifiers",
                               "no 'panel.id' was set in this estimation")
                err_msg = c(err_msg, msg)
                next
              }

              vname = object$panel.id[guesses$panel.id]

              vname_all = all.vars(str2expression(vname))
              if(!all(vname_all %in% data_names)){
                pblm = setdiff(vname_all, data_names)
                msg = setNames(nm = "the 'panel.id' identifiers",
                               sma("the variable{$s ? pblm} set in 'panel.id' {$are} not in the data set"))
                err_msg = c(err_msg, msg)
                next
              }

              # ATTENTION:
              # if the panel.id variable has NA values, and there are no lags used in the estimation
              # (which would trigger obs removal), then we cannot use the panel.id
              # for the VCOV reliably since the sample would differ from the sample used in the
              # estimation

              var_names_all[vcov_var_name] = vname
              if(only_varnames) break

              value = eval(str2expression(vname), data)
              var_values_all[[vcov_var_name]] = trim_obs_removed(value, object)
              break

            } else if(type == "regex"){

              ok = FALSE
              msg = NULL
              for(pat in guesses$regex){
                var_id = which(grepl(pat, data_names))

                if(length(var_id) == 0){
                  if(is.null(msg)){
                    msg = setNames(nm = "the variable names of the data set",
                                   paste0("no match was found for ", vcov_var_value$label))
                  }
                } else if(length(var_id) == 1){
                  ok = TRUE
                  vname = data_names[var_id]
                  break
                } else {
                  msg = setNames(nm = "the variable names of the data set",
                                 sma("several matches were found: {enum.i.q ? data_names[var_id]}"))
                }
              }

              if(ok){
                var_names_all[vcov_var_name] = vname
                if(only_varnames) break

                var_values_all[[vcov_var_name]] = trim_obs_removed(data[[vname]], object)
                break
              } else {
                err_msg = c(err_msg, msg)
              }
            }
          }

          if(is.null(var_names_all[vcov_var_name]) &&
             length(err_msg) > 0 && !isTRUE(vcov_var_value$optional)){

            if(length(err_msg) == 1){
              expect = names(err_msg)
              pblm = err_msg
            } else {
              expect = enumerate_items(names(err_msg), "enum a.or")
              pblm = enumerate_items(err_msg, "enum a")
            }

            stop("To compute the ", vcov_select$vcov_label, " VCOV, we need a variable for the ",
                 vcov_var_value$label, ". Since you didn't provide it in the formula, ", 
                 "we typically deduce it from ", expect, ". ", 
                 "PROBLEM: ", pblm, ". Please provide it in the formula.")
          }
        }
      }

      if(only_varnames){
        return(var_names_all)
      }
    }


    # Extra checking + putting into int if requested
    for(i in seq_along(var_values_all)){
      vcov_var_name = names(var_values_all)[i]
      vname = var_names_all[vcov_var_name]
      value = var_values_all[[i]]

      vcov_var_value = vcov_select$vars[[vcov_var_name]]

      if(!isTRUE(is_int_all[[vcov_var_name]]) && anyNA(value)){
        # First condition means: it is a fixed-effect used in the estimation => no need to check
        stop("The variable '", vname, "' used to estimate the VCOV (employed as ", vcov_var_value$label, ") has NA values which would lead to a sample used to compute the VCOV different from the sample used to estimate the parameters. This would lead to wrong inference.\nPossible solutions: i) ex ante prune them, ii) impute them, or iii) use the argument 'vcov' at estimation time.")
      }

      if(isTRUE(vcov_var_value$to_int)){
        # if to_int => we do not have to check the type since it can be applied to any type

        if(!isTRUE(is_int_all[[vcov_var_name]])){
          var_values_all[[vcov_var_name]] = quickUnclassFactor(value)
        }

      } else if(!is.null(vcov_var_value$expected_type)){
        check_value(value, .type = vcov_var_value$expected_type,
              .prefix = paste0("To compute the VCOV, the ", vcov_var_value$label, " (here '", vname, "') "))
      }
    }

    vcov_vars = var_values_all

  }


  # Checking the nber of threads
  if(!missing(nthreads)) nthreads = check_set_nthreads(nthreads)


  ####
  #### ... scores ####
  ####

  # We handle the bounded parameters:
  isBounded = object$isBounded
  if(is.null(isBounded)){
    isBounded = rep(FALSE, length(object$coefficients))
  }

  if(any(isBounded)){
    if(keepBounded){
      # we treat the bounded parameters as regular variables
      scores = object$scores
      object$cov.iid = solve(object$hessian)
    } else {
      scores = object$scores[, -which(isBounded), drop = FALSE]
    }
  } else {
    scores = object$scores
  }


  ####
  #### ... bread ####
  ####

  n = object$nobs

  if(object$method_type == "feols"){
    iid_names = all_vcov[[1]]$name
    if(vcov %in% iid_names){
      bread = object$cov.iid / ((n - 1) / (n - object$nparams))
    } else {
      bread = object$cov.iid / object$sigma2
    }
  } else {
    bread = object$cov.iid
  }

  if(anyNA(bread)){

    IS_NA_VCOV = FALSE

    if(!forceCovariance){
      IS_NA_VCOV = TRUE
    } else {
      info_inv = cpp_cholesky(object$hessian)
      if(!is.null(info_inv$all_removed)){
        # Means all variables are collinear! => can happen when using FEs
        IS_NA_VCOV = TRUE
      }
    }

    if(IS_NA_VCOV){
      attr(bread, "type") = "NA (not-available)"

      if(is_attr){
        attr(bread, "dof.K") = object$nparams
        attr(bread, "df.t") = NA
      }

      return(bread)
    }

    VCOV_raw_forced = info_inv$XtX_inv
    if(any(info_inv$id_excl)){
      n_collin = sum(info_inv$all_removed)
      mema("NOTE: {N ? n_collin} variable{#s, have} been found to be singular.")

      VCOV_raw_forced = cpp_mat_reconstruct(VCOV_raw_forced, info_inv$id_excl)
      VCOV_raw_forced[, info_inv$id_excl] = NA
      VCOV_raw_forced[info_inv$id_excl, ] = NA
    }

    bread = VCOV_raw_forced

  }

  ####
  #### ... vcov no adj ####
  ####

  # we compute the vcov. The adjustment (which is a pain in the neck) will come after that
  # Here vcov is ALWAYS a character scalar

  # ssc related => we accept NULL
  # we check ssc since it can be used by the funs
  if(missnull(ssc)) ssc = getFixest_ssc()
  check_arg(ssc, "class(ssc.type)", 
            .message = "The argument 'ssc' must be an object created by the function ssc().")

  fun_name = vcov_select$fun_name
  args = list(bread = bread, scores = scores, vars = vcov_vars, ssc = ssc,
              sandwich = sandwich, nthreads = nthreads,
              var_names_all = var_names_all)

  for(a in names(extra_args)){
    # I have to add this weird condition (because of the aliases)
    if(!is.null(extra_args[[a]])) args[[a]] = extra_args[[a]]
  }

  vcov_noAdj = do.call(fun_name, args)

  if(!sandwich){
    return(vcov_noAdj)
  }

  dimnames(vcov_noAdj) = dimnames(bread)

  ####
  #### ... ssc ####
  ####

  # ssc is a ssc object in here

  n_fe = n_fe_ok = length(object$fixef_id)

  # we adjust the fixef sizes to account for slopes
  fixef_sizes_ok = object$fixef_sizes
  isSlope = FALSE
  if(!is.null(object$fixef_terms)){
    isSlope = TRUE
    # The drop the fixef_sizes for only slopes
    fixef_sizes_ok[object$slope_flag < 0] = 0
    n_fe_ok = sum(fixef_sizes_ok > 0)
  }

  nested_vars = sapply(vcov_select$vars, function(x) isTRUE(x$rm_nested))
  any_nested_var = length(nested_vars) > 0 && length(vcov_vars) > 0 && any(nested_vars[names(vcov_vars)])

  if(ssc$fixef.K == "none"){
    # we do it with "minus" because of only slopes
    K = object$nparams
    if(n_fe_ok > 0){
      K = K - (sum(fixef_sizes_ok) - (n_fe_ok - 1))
    }
  } else if(ssc$fixef.K == "full" || !any_nested_var){
    K = object$nparams
    if(ssc$fixef.force_exact && n_fe >= 2 && n_fe_ok >= 1){
      fe = fixef(object, notes = FALSE)
      K = K + (n_fe_ok - 1) - sum(attr(fe, "references"))
    }
  } else {
    # nested
    # we delay the adjustment
    K = object$nparams
  }

  #
  # NESTING (== pain in the neck)
  #

  if(ssc$fixef.K == "nested" && n_fe_ok > 0 && any_nested_var){
    # OK, let's go checking....
    # We always try to minimize computation.
    # So we maximize deduction and apply computation only in last resort.

    nested_vcov_var_names = intersect(names(nested_vars[nested_vars]), names(vcov_vars))
    nested_var_names = var_names_all[nested_vcov_var_names]

    is_nested = rep(FALSE, n_fe)
    for(i in 1:n_fe){
      # We check for each FE
      fe_name = object$fixef_vars[i]

      if(fe_name %in% nested_var_names){
        is_nested[i] = TRUE
        next

      } else if(grepl("^", fe_name, fixed = TRUE)){
        # nesting of the FE in the parent term
        fe_name_split = strsplit(fe_name, "^", fixed = TRUE)[[1]]
        if(any(fe_name_split %in% nested_var_names)){
          is_nested[i] = TRUE
          next
        }
      }
    }


    # We check the remaining FEs, only if necessary
    if(!all(is_nested) && !all(nested_var_names %in% object$fixef_vars)){
      # Note that if all(nested_var_names %in% object$fixef_vars) == TRUE
      # Then all the variables used to compute the VCOV are part of the FEs
      # So the nesting would have been correctly spotted right before.
      # Only caveat: if the user has a^b in FE and includes the parent terms (a and b), we may forget some nested FEs
      # But then that's the user problem bc it's an erroneous specification

      vcov_vars_nesting = vcov_vars[nested_vcov_var_names]
      # We put the non integer to integer (needed)
      id_to_int = which(sapply(vcov_select$vars[nested_vcov_var_names], function(x) !isTRUE(x$to_int)))

      for(i in id_to_int){
        vcov_vars_nesting[[i]] = quickUnclassFactor(vcov_vars_nesting[[i]])
      }

      id2check = which(is_nested == FALSE)
      info = cpp_check_nested(object$fixef_id[id2check], vcov_vars_nesting, 
                              object$fixef_sizes[id2check], n = n)
      is_nested[id2check] = info == 1
    }


    if(sum(is_nested) == n_fe){
      # All FEs are removed, we add 1 for the intercept
      K = K - (sum(fixef_sizes_ok) - (n_fe_ok - 1)) + 1
    } else {
      if(ssc$fixef.force_exact && n_fe >= 2){
        fe = fixef(object, notes = FALSE)
        nb_ref = attr(fe, "references")

        # Slopes are a pain in the neck!!!
        if(sum(is_nested) > 1){
          id_nested = intersect(names(nb_ref), names(object$fixef_id)[is_nested])
          nb_ref[id_nested] = object$fixef_sizes[id_nested]
        }

        total_refs = sum(nb_ref)

        K = K - total_refs
      } else {
        K = K - (sum(fixef_sizes_ok[is_nested]) - sum(fixef_sizes_ok[is_nested] > 0))
      }
    }

    # below for consistency => should *not* be triggered
    K = max(K, length(object$coefficients) + 1)
  }

  # Small sample adjustment
  ss_adj = attr(vcov_noAdj, "ss_adj")
  if(!is.null(ss_adj)){
    if(is.function(ss_adj)){
      ss_adj = ss_adj(n = n, K = K)
    }
    attr(vcov_noAdj, "ss_adj") = NULL
  } else {
    ss_adj = ifelse(ssc$adj, (n - 1) / (n - K), 1)
  }

  vcov_mat = vcov_noAdj * ss_adj

  ####
  #### ... vcov attributes ####
  ####


  if(any(diag(vcov_mat) < 0) && vcov_fix){
    # We 'fix' it
    all_attr = attributes(vcov_mat)
    vcov_mat = mat_posdef_fix(vcov_mat)
    is_complex = isTRUE(attr(vcov_mat, "is_complex"))

    if (is_complex) {
      # we should never have a complex VCOV, but just in case...
      message("The VCOV matrix could not be fixed since its eigenvalues were complex. The complex standard-errors are reported for information purposes.")
      vcov_mat = as.complex(vcov_mat)
    } else {
      message("Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).")
    }

    attributes(vcov_mat) = all_attr

  }

  if(is_attr){

    min_cluster_size = attr(vcov_mat, "min_cluster_size")
    if(is.numeric(ssc$t.df)){
      attr(vcov_mat, "df.t") = ssc$t.df

    } else if(ssc$t.df == "min" && !is.null(min_cluster_size)){
      attr(vcov_mat, "df.t") = max(min_cluster_size - 1, 1)

    } else {
      attr(vcov_mat, "df.t") = max(n - K, 1)
    }

    if(is.null(attr(vcov_mat, "type", exact = TRUE))){
      type_info = attr(vcov_mat, "type_info")
      attr(vcov_mat, "type") = paste0(vcov_select$vcov_label, type_info)
      attr(vcov_mat, "type_info") = NULL
    }

    attr(vcov_mat, "ssc") = ssc
    attr(vcov_mat, "dof.K") = K
  } else {
    # We clean the attributes
    all_attr = names(attributes(vcov_mat))
    for(v in setdiff(all_attr, c("dim", "dimnames"))){
      attr(vcov_mat, v) = NULL
    }
  }

  vcov_mat
}




####
#### Small Sample Correction ####
####



#' Governs the small sample correction in `fixest` VCOVs
#'
#' Provides how the small sample correction should be calculated in [`vcov.fixest`]/[`summary.fixest`].
#'
#' @param adj Logical scalar, defaults to `TRUE`. Whether to apply a small sample adjustment of 
#' the form `(n - 1) / (n - K)`, with `K` the number of estimated parameters. If `FALSE`, then 
#' no adjustment is made.
#' @param fixef.K Character scalar equal to `"nested"` (default), `"none"` or `"full"`. In 
#' the small sample adjustment, how to account for the fixed-effects parameters. If `"none"`, 
#' the fixed-effects parameters are discarded, meaning the number of parameters (`K`) is only 
#' equal to the number of variables. If `"full"`, then the number of parameters is equal to 
#' the number of variables plus the number of fixed-effects. Finally, if `"nested"`, then 
#' the number of parameters is equal to the number of variables plus the number of 
#' fixed-effects that *are not* nested in the clusters used to cluster the standard-errors.
#' @param fixef.force_exact Logical, default is `FALSE`. If there are 2 or more fixed-effects, 
#' these fixed-effects they can be irregular, meaning they can provide the same information. 
#' If so, the "real" number of parameters should be lower than the total number of 
#' fixed-effects. If `fixef.force_exact = TRUE`, then [`fixef.fixest`] is first run to 
#' determine the exact number of parameters among the fixed-effects. Mostly, panels of 
#' the type individual-firm require `fixef.force_exact = TRUE` (but it adds computational costs).
#' @param cluster.adj Logical scalar, default is `TRUE`. How to make the small sample correction 
#' when clustering the standard-errors? If `TRUE` a `G/(G-1)` correction is performed with `G` 
#' the number of cluster values.
#' @param cluster.df Either "conventional" or "min" (default). Only relevant when the 
#' variance-covariance matrix is two-way clustered (or higher). It governs how the small 
#' sample adjustment for the clusters is to be performed. \[Sorry for the jargon that follows.\] 
#' By default a unique adjustment is made, of the form G_min/(G_min-1) with G_min the 
#' smallest G_i. If `cluster.df="conventional"` then the i-th "sandwich" matrix is adjusted 
#' with G_i/(G_i-1) with G_i the number of unique clusters.
#' @param t.df Either "conventional", "min" (default) or an integer scalar. Only relevant when 
#' the variance-covariance matrix is clustered. It governs how the p-values should be computed. 
#' By default, the degrees of freedom of the Student t distribution is equal to the minimum size 
#' of the clusters with which the VCOV has been clustered minus one. If `t.df="conventional"`, 
#' then the degrees of freedom of the Student t distribution is equal to the number of 
#' observations minus the number of estimated variables. You can also pass a number to 
#' manually specify the DoF of the t-distribution.
#'
#' @details
#'
#' The following vignette: [On standard-errors](https://lrberge.github.io/fixest/articles/standard_errors.html), 
#' describes in details how the standard-errors are computed in `fixest` and how you can 
#' replicate standard-errors from other software.
#'
#' @return
#' It returns a `ssc.type` object.
#'
#' @author
#' Laurent Berge
#'
#' @seealso
#' [`summary.fixest`], [`vcov.fixest`]
#'
#' @examples
#'
#' #
#' # Equivalence with lm/glm standard-errors
#' #
#'
#' # LM
#' # In the absence of fixed-effects,
#' # by default, the standard-errors are computed in the same way
#'
#' res = feols(Petal.Length ~ Petal.Width + Species, iris)
#' res_lm = lm(Petal.Length ~ Petal.Width + Species, iris)
#' vcov(res) / vcov(res_lm)
#'
#' # GLM
#' # By default, there is no small sample adjustment in glm, as opposed to feglm.
#' # To get the same SEs, we need to use ssc(adj = FALSE)
#'
#' res_pois = fepois(round(Petal.Length) ~ Petal.Width + Species, iris)
#' res_glm = glm(round(Petal.Length) ~ Petal.Width + Species, iris, family = poisson())
#' vcov(res_pois, ssc = ssc(adj = FALSE)) / vcov(res_glm)
#'
#' # Same example with the Gamma
#' res_gamma = feglm(round(Petal.Length) ~ Petal.Width + Species, iris, family = Gamma())
#' res_glm_gamma = glm(round(Petal.Length) ~ Petal.Width + Species, iris, family = Gamma())
#' vcov(res_gamma, ssc = ssc(adj = FALSE)) / vcov(res_glm_gamma)
#'
#' #
#' # Fixed-effects corrections
#' #
#'
#' # We create "irregular" FEs
#' base = data.frame(x = rnorm(10))
#' base$y = base$x + rnorm(10)
#' base$fe1 = rep(1:3, c(4, 3, 3))
#' base$fe2 = rep(1:5, each = 2)
#'
#' est = feols(y ~ x | fe1 + fe2, base)
#'
#' # fe1: 3 FEs
#' # fe2: 5 FEs
#'
#' #
#' # Clustered standard-errors: by fe1
#' #
#'
#' # Default: fixef.K = "nested"
#' #  => adjustment K = 1 + 5 (i.e. x + fe2)
#' summary(est)
#' attributes(vcov(est, attr = TRUE))[c("ssc", "dof.K")]
#'
#'
#' # fixef.K = FALSE
#' #  => adjustment K = 1 (i.e. only x)
#' summary(est, ssc = ssc(fixef.K = "none"))
#' attr(vcov(est, ssc = ssc(fixef.K = "none"), attr = TRUE), "dof.K")
#'
#'
#' # fixef.K = TRUE
#' #  => adjustment K = 1 + 3 + 5 - 1 (i.e. x + fe1 + fe2 - 1 restriction)
#' summary(est, ssc = ssc(fixef.K = "full"))
#' attr(vcov(est, ssc = ssc(fixef.K = "full"), attr = TRUE), "dof.K")
#'
#'
#' # fixef.K = TRUE & fixef.force_exact = TRUE
#' #  => adjustment K = 1 + 3 + 5 - 2 (i.e. x + fe1 + fe2 - 2 restrictions)
#' summary(est, ssc = ssc(fixef.K = "full", fixef.force_exact = TRUE))
#' attr(vcov(est, ssc = ssc(fixef.K = "full", fixef.force_exact = TRUE), attr = TRUE), "dof.K")
#'
#' # There are two restrictions:
#' attr(fixef(est), "references")
#'
#' #
#' # To permanently set the default ssc:
#' #
#'
#' # eg no small sample adjustment:
#' setFixest_ssc(ssc(adj = FALSE))
#'
#' # Factory default
#' setFixest_ssc()
#'
ssc = function(adj = TRUE, fixef.K = "nested", cluster.adj = TRUE, cluster.df = "min",
               t.df = "min", fixef.force_exact = FALSE){

  check_set_arg(adj, "loose logical scalar conv")
  check_set_arg(fixef.K, "match(none, full, nested)")
  check_set_arg(cluster.df, "match(conventional, min)")
  check_set_arg(t.df, "match(conventional, min) | integer scalar GT{0}")
  check_arg(fixef.force_exact, cluster.adj, "logical scalar")

  res = list(adj = adj, fixef.K = fixef.K, cluster.adj = cluster.adj, cluster.df = cluster.df,
             t.df = t.df, fixef.force_exact = fixef.force_exact)
  class(res) = "ssc.type"

  res
}

#' @describeIn ssc This function is deprecated and will be removed at some point (in 6 months from August 2021). Exactly the same as `ssc`.
dof = function(adj = TRUE, fixef.K = "nested", cluster.adj = TRUE, cluster.df = "min",
               t.df = "min", fixef.force_exact = FALSE){

  if(is.null(getOption("fixest_warn_dof"))){
    warning("The function 'dof' is deprecated. Please use function 'ssc' instead.")
    options(fixest_warn_dof = TRUE)
  }

  ssc(adj = adj, fixef.K = fixef.K, cluster.adj = cluster.adj, cluster.df = cluster.df,
      t.df = t.df, fixef.force_exact = fixef.force_exact)
}


####
#### User-level ####
####


#' Clustered VCOV
#'
#' Computes the clustered VCOV of `fixest` objects.
#' 
#' @inheritParams vcov.fixest
#'
#' @param x A `fixest` object.
#' @param cluster Either i) a character vector giving the names of the variables onto which to 
#' cluster, or ii) a formula giving those names, or iii) a vector/list/data.frame giving the hard 
#' values of the clusters. Note that in cases i) and ii) the variables are fetched directly in the 
#' data set used for the estimation.
#' @param ssc An object returned by the function [`ssc`]. It specifies how to perform the small 
#' sample correction.
#'
#' @return
#' If the first argument is a `fixest` object, then a VCOV is returned (i.e. a symmetric matrix).
#'
#' If the first argument is not a `fixest` object, then a) implicitly the arguments are shifted to 
#' the left (i.e. `vcov_cluster(~var1 + var2)` is equivalent to 
#' `vcov_cluster(cluster = ~var1 + var2)`) and b) a VCOV-*request* is returned and NOT a VCOV. 
#' That VCOV-request can then be used in the argument `vcov` of various `fixest` 
#' functions (e.g. [`vcov.fixest`] or even in the estimation calls).
#'
#' @author
#' Laurent Berge
#'
#' @references
#' Cameron AC, Gelbach JB, Miller DL (2011). "Robust Inference with Multiway Clustering." *Journal of Business & Economic Statistics*, 29(2), 238-249. doi:10.1198/jbes.2010.07136.
#'
#' @examples
#'
#' base = iris
#' names(base) = c("y", "x1", "x2", "x3", "species")
#' base$clu = rep(1:5, 30)
#'
#' est = feols(y ~ x1, base)
#'
#' # VCOV: using a formula giving the name of the clusters
#' vcov_cluster(est, ~species + clu)
#'
#' # works as well with a character vector
#' vcov_cluster(est, c("species", "clu"))
#'
#' # you can also combine the two with '^'
#' vcov_cluster(est, ~species^clu)
#'
#' #
#' # Using VCOV requests
#' #
#'
#' # per se: pretty useless...
#' vcov_cluster(~species)
#'
#' # ...but VCOV-requests can be used at estimation time:
#' # it may be more explicit than...
#' feols(y ~ x1, base, vcov = vcov_cluster("species"))
#'
#' # ...the equivalent, built-in way:
#' feols(y ~ x1, base, vcov = ~species)
#'
#' # The argument vcov does not accept hard values,
#' # so you can feed them with a VCOV-request:
#' feols(y ~ x1, base, vcov = vcov_cluster(rep(1:5, 30)))
#'
#'
vcov_cluster = function(x, cluster = NULL, ssc = NULL, vcov_fix = TRUE){
  # User-level function to compute clustered SEs
  # typically we only do checking and reshaping here

  # slide_args allows the implicit allocation of arguments
  # it makes semi-global changes => the values of the args here are modified
  slide_args(x, cluster = cluster, ssc = ssc)
  IS_REQUEST = is.null(x)

  check_value(ssc, "NULL class(ssc.type)", .message = "The argument 'ssc' must be an object created by the function ssc().")


  # We create the request

  use_request = FALSE
  vcov_vars = var_names_all = NULL
  if(is.null(cluster)){
    vcov = "cluster"
  } else if(inherits(cluster, "formula")){
    vcov = cluster
  } else if(is.character(cluster) && length(cluster) <= 4){
    vcov = .xpd(lhs = "cluster", rhs = cluster)
  } else {
    use_request = TRUE
    vcov = "cluster"
    vcov_vars = cluster
  }

  # The main task is to check the validity of the vcov_vars in input
  # and give proper names
  if(!is.null(vcov_vars)){

    sc = sys.call()
    if("cluster" %in% names(sc)){
      cl_name = fetch_arg_deparse("cluster")
    } else {
      cl_name = deparse_long(sc[[2]])
    }

    if(is.atomic(vcov_vars)){
      vcov_vars = list(cl1 = vcov_vars)
      var_names_all["cl1"] = gsub("^.+\\$", "", cl_name)

    } else if(!is.list(vcov_vars) || length(vcov_vars) > 4){
      # We send an error + a reminder of the accepted types
      check_value(vcov_vars, "os formula | character vector | list len(,4)", .arg_name = "cluster")

    } else {
      vcov_var_names = paste0("cl", 1:length(vcov_vars))

      if(!is.null(names(vcov_vars))){
        var_names_all[vcov_var_names] = names(vcov_vars)

      } else {

        len = length(vcov_var_names)
        new_name = ifunit(len, cl_name, paste0(cl_name, "..", 1:len))

        var_names_all[vcov_var_names] = new_name
      }

      # We ensure it's a plain list
      vcov_vars = unclass(vcov_vars)
      names(vcov_vars) = vcov_var_names

      # Some further checking
      if(!is.atomic(vcov_vars[[1]])){
        stop("If the argument 'cluster' it to be a list, it must be made of atomic vectors representing the vectors on which to cluster. Currently its first element is of class '", class(vcov_vars[[1]])[1], "'.")
      }

    }

  }

  if(use_request || !is.null(ssc)){
    vcov_request = list(vcov = vcov, vcov_vars = vcov_vars,
              var_names_all = var_names_all, ssc = ssc)
    class(vcov_request) = "fixest_vcov_request"

  } else {
    # Everything can fit into a vcov formula
    vcov_request = vcov
  }

  if(IS_REQUEST){
    res = vcov_request
  } else {
    res = vcov(x, vcov = vcov_request, vcov_fix = vcov_fix)
  }

  res
}



#' HAC VCOVs
#'
#' Set of functions to compute the VCOVs robust to different forms correlation in panel or 
#' time series settings.
#'
#' There are currently three VCOV types: Newey-West applied to time series, Newey-West applied to 
#' a panel setting (when the argument 'unit' is not missing), and Driscoll-Kraay.
#'
#' The functions on this page without the prefix "vcov_" do not compute VCOVs directly but 
#' are meant to be used in the argument `vcov` of `fixest` functions (e.g. in [`vcov.fixest`] 
#' or even in the estimation calls).
#'
#' Note that for Driscoll-Kraay VCOVs, to ensure its properties the number of periods should 
#' be long enough (a minimum of 20 periods or so).
#'
#' @inheritParams vcov_cluster
#'
#' @param unit A character scalar or a one sided formula giving the name of the 
#' variable representing the units of the panel.
#' @param time A character scalar or a one sided formula giving the name of the 
#' variable representing the time.
#' @param lag An integer scalar, default is `NULL`. If `NULL`, then the default lag is equal to 
#' `n_t^0.25` with `n_t` the number of time periods (as of Newey and West 1987) for panel 
#' Newey-West and Driscoll-Kraay. The default for the time series Newey-West is computed via 
#' [`bwNeweyWest`][sandwich::NeweyWest] which implements the Newey and West 1994 method.
#'
#' @section Lag selection:
#'
#' The default lag selection depends on whether the VCOV applies to a panel or a time series.
#'
#' For panels, i.e. panel Newey-West or Driscoll-Kraay VCOV, the default lag is `n_t^0.25` with 
#' `n_t` the number of time periods. This is based on Newey and West 1987.
#'
#' For time series Newey-West, the default lag is found thanks to the 
#' [`bwNeweyWest`][sandwich::NeweyWest] function from the `sandwich` package. It is based on 
#' Newey and West 1994.
#'
#'
#' @references
#' Newey WK, West KD (1987). "A Simple, Positive Semi-Definite, Heteroskedasticity and 
#' Autocorrelation Consistent Covariance Matrix." *Econometrica*, 55(3), 703-708. doi:10.2307/1913610.
#'
#' Driscoll JC, Kraay AC (1998). "Consistent Covariance Matrix Estimation with Spatially Dependent 
#' Panel Data." *The Review of Economics and Statistics*, 80(4), 549-560. doi:10.1162/003465398557825.
#'
#' Millo G (2017). "Robust Standard Error Estimators for Panel Models: A Unifying Approach" 
#' *Journal of Statistical Software*, 82(3). doi:10.18637/jss.v082.i03.
#'
#' @return
#' If the first argument is a `fixest` object, then a VCOV is returned (i.e. a symmetric matrix).
#'
#' If the first argument is not a `fixest` object, then a) implicitly the arguments are shifted to 
#' the left (i.e. `vcov_DK(~year)` is equivalent to `vcov_DK(time = ~year)`) and b) a 
#' VCOV-*request* is returned and NOT a VCOV. That VCOV-request can then be used in the argument 
#' `vcov` of various `fixest` functions (e.g. [`vcov.fixest`] or even in the estimation calls).
#'
#' @examples
#'
#' data(base_did)
#'
#' #
#' # During the estimation
#' #
#'
#' # Panel Newey-West, lag = 2
#' feols(y ~ x1, base_did, NW(2) ~ id + period)
#'
#' # Driscoll-Kraay
#' feols(y ~ x1, base_did, DK ~ period)
#'
#' # If the estimation is made with a panel.id, the dimensions are
#' # automatically deduced:
#' est = feols(y ~ x1, base_did, "NW", panel.id = ~id + period)
#' est
#'
#' #
#' # Post estimation
#' #
#'
#' # If missing, the unit and time are automatically deduced from
#' # the panel.id used in the estimation
#' vcov_NW(est, lag = 2)
#'
#'
#' @name vcov_hac
NULL

#' @rdname vcov_hac
vcov_DK = function(x, time = NULL, lag = NULL, ssc = NULL, vcov_fix = TRUE){
  # unit and time MUST be variables of the data set
  # otherwise: too error prone + extremely complex to set up + very edge case, so not worth it
  #

  # slide_args allows the implicit allocation of arguments
  # it makes semi-global changes => the values of the args here are modified
  slide_args(x, time = time, lag = lag, ssc = ssc)
  IS_REQUEST = is.null(x)

  check_value(lag, "NULL integer scalar GE{1}")
  check_value(ssc, "NULL class(ssc.type)", 
              .message = "The argument 'ssc' must be an object created by the function ssc().")

  # time
  check_value(time, "NULL character scalar | os formula")

  if(inherits(time, "formula")){
    time_txt = fml2varnames(time)
    check_value(time_txt, "character scalar", 
                .message = "The argument 'time' must be composed of only one variable.")
  }

  # recreating the call
  vcov = .xpd(lhs = "dk", rhs = time)

  if(!is.null(lag) || !is.null(ssc)){
    extra_args = list(lag = lag)
    vcov_request = list(vcov = vcov, ssc = ssc, extra_args = extra_args)
    class(vcov_request) = "fixest_vcov_request"
  } else {
    # Everything can fit into a vcov formula
    vcov_request = vcov
  }

  if(IS_REQUEST){
    res = vcov_request
  } else {
    res = vcov(x, vcov = vcov_request, vcov_fix = vcov_fix)
  }

  res

}

#' @rdname vcov_hac
vcov_NW = function(x, unit = NULL, time = NULL, lag = NULL, ssc = NULL, vcov_fix = TRUE){
  # unit and time MUST be variables of the data set
  # otherwise: too error prone + extremely complex to set up + very edge case, so not worth it
  #

  # slide_args allows the implicit allocation of arguments
  # it makes semi-global changes => the values of the args here are modified
  slide_args(x, unit = unit, time = time, lag = lag, ssc = ssc)
  IS_REQUEST = is.null(x)

  check_value(lag, "NULL integer scalar GE{1}")
  check_value(ssc, "NULL class(ssc.type)", 
              .message = "The argument 'ssc' must be an object created by the function ssc().")

  # unit
  check_value(unit, "NULL character scalar | os formula")

  if(inherits(unit, "formula")){
    unit_txt = fml2varnames(unit)
    check_value(unit_txt, "character scalar", 
                .message = "The argument 'unit' must be composed of only one variable.")
  }

  # time
  check_value(time, "NULL character scalar | os formula")

  if(inherits(time, "formula")){
    time_txt = fml2varnames(time)
    check_value(time_txt, "character scalar", 
                .message = "The argument 'time' must be composed of only one variable.")
  }

  # recreating the call
  vcov = .xpd(lhs = "nw", rhs = c(unit, time))

  if(!is.null(lag) || !is.null(ssc)){
    extra_args = list(lag = lag)
    vcov_request = list(vcov = vcov, ssc = ssc, extra_args = extra_args)
    class(vcov_request) = "fixest_vcov_request"
  } else {
    # Everything can fit into a vcov formula
    vcov_request = vcov
  }

  if(IS_REQUEST){
    res = vcov_request
  } else {
    res = vcov(x, vcov = vcov_request, vcov_fix = vcov_fix)
  }

  res

}


#' Conley VCOV
#'
#' Compute VCOVs robust to spatial correlation, a la Conley (1999).
#'
#' This function computes VCOVs that are robust to spatial correlations by assuming a correlation 
#' between the units that are at a geographic distance lower than a given cutoff.
#'
#' The kernel is uniform.
#'
#' If the cutoff is not provided, an estimation of it is given. This cutoff ensures that a minimum 
#' of units lie within it and is robust to sub-sampling. This automatic cutoff is only here for 
#' convenience, the most appropriate cutoff shall depend on the application and shall be provided 
#' by the user.
#'
#' The function `conley` does not compute VCOVs directly but is meant to be used in the argument 
#' `vcov` of `fixest` functions (e.g. in [`vcov.fixest`] or even in the estimation calls).
#'
#' @inheritParams vcov_cluster
#'
#' @param lat A character scalar or a one sided formula giving the name of the variable 
#' representing the latitude. The latitude must lie in \[-90, 90\], \[0, 180\] or \[-180, 0\].
#' @param lon A character scalar or a one sided formula giving the name of the variable 
#' representing the longitude. The longitude must be in \[-180, 180\], \[0, 360\] or \[-360, 0\].
#' @param cutoff The distance cutoff, in km. You can express the cutoff in miles by writing the 
#' number in character form and adding "mi" as a suffix: cutoff = "100mi" would be 100 miles. If 
#' missing, a rule of thumb is used to deduce the cutoff.
#' @param pixel A positive numeric scalar, default is 0. If a positive number, the coordinates of 
#' each observation are pooled into `pixel` x `pixel` km squares. This lowers the precision but can 
#' (depending on the cases) greatly improve computational speed at a low precision cost. Note that 
#' if the `cutoff` was expressed in miles, then `pixel` will also be in miles.
#' @param distance How to compute the distance between points. It can be equal to "triangular" 
#' (default) or "spherical". The latter case corresponds to the great circle distance and is more 
#' precise than triangular but is a bit more intensive computationally.
#'
#' @return
#' If the first argument is a `fixest` object, then a VCOV is returned (i.e. a symmetric matrix).
#'
#' If the first argument is not a `fixest` object, then a) implicitly the arguments are shifted to 
#' the left (i.e. `vcov_conley("lat", "long")` is equivalent to 
#' `vcov_conley(lat = "lat", lon = "long")`) and b) a VCOV-*request* is returned and NOT a VCOV. 
#' That VCOV-request can then be used in the argument `vcov` of various `fixest` functions 
#' (e.g. [`vcov.fixest`] or even in the estimation calls).
#'
#' @references
#' Conley TG (1999). "GMM Estimation with Cross Sectional Dependence", *Journal of Econometrics*, 92, 1-45.
#'
#' @examples
#'
#' data(quakes)
#'
#' # We use conley() in the vcov argument of the estimation
#' feols(depth ~ mag, quakes, conley(100))
#'
#' # Post estimation
#' est = feols(depth ~ mag, quakes)
#' vcov_conley(est, cutoff = 100)
#'
#'
#'
vcov_conley = function(x, lat = NULL, lon = NULL, cutoff = NULL, pixel = 0,
                       distance = "triangular", ssc = NULL, vcov_fix = TRUE){

  # slide_args allows the implicit allocation of arguments
  # it makes semi-global changes => the values of the args here are modified
  slide_args(x, lat = lat, lon = lon, cutoff = cutoff, pixel = pixel, 
             distance = distance, ssc = ssc)
  IS_REQUEST = is.null(x)

  check_value(ssc, "NULL class(ssc.type)", 
              .message = "The argument 'ssc' must be an object created by the function ssc().")

  # lat
  check_value(lat, "NULL character scalar | os formula")

  if(inherits(lat, "formula")){
    lat_txt = fml2varnames(lat)
    check_value(lat_txt, "character scalar", 
                .message = "The argument 'lat' must be composed of only one variable.")
  }


  # lon
  check_value(lon, "NULL character scalar | os formula")

  if(inherits(lon, "formula")){
    lon_txt = fml2varnames(lon)
    check_value(lon_txt, "character scalar", 
                .message = "The argument 'lon' must be composed of only one variable.")
  }

  # cutoff
  cutoff = check_set_cutoff(cutoff)

  # pixel
  check_value(pixel, "numeric scalar GE{0}",
              .prefix = "In vcov.fixest, Conley VCOV cannot be computed: the argument 'pixel'")

  # distance
  check_set_value(distance, "match(triangular, spherical)")

  # recreating the call
  vcov = .xpd(lhs = "conley", rhs = c(lat, lon))

  extra_args = list(cutoff = cutoff, pixel = pixel, distance = distance)
  vcov_request = list(vcov = vcov, ssc = ssc, extra_args = extra_args)
  class(vcov_request) = "fixest_vcov_request"

  if(IS_REQUEST){
    res = vcov_request
  } else {
    res = vcov(x, vcov = vcov_request, vcov_fix = vcov_fix)
  }

  res
}


####
#### SETUP ####
####



vcov_setup = function(){

  #
  # iid
  #

  vcov_iid_setup = list(name = c("iid", "normal", "standard"), 
                        fun_name = "vcov_iid_internal", 
                        vcov_label = "IID")

  #
  # Heteroskedasticity robust
  #

  vcov_hetero_setup = list(name = c("hetero", "white", "hc1"), 
                           fun_name = "vcov_hetero_internal", 
                           vcov_label = "Heteroskedasticity-robust")

  #
  # cluster(s)
  #

  vcov_clust_setup = list(name = c("cluster", ""), 
                          fun_name = "vcov_cluster_internal", 
                          vcov_label = "Clustered")
  
  vcov_clust_setup$vars = list(
    cl1 = list(guess_from = list(panel.id = 1, fixef = 1), 
               label = "clusters", to_int = TRUE, rm_nested = TRUE),
    cl2 = list(optional = TRUE, label = "second cluster", to_int = TRUE, rm_nested = TRUE),
    cl3 = list(optional = TRUE, label = "third cluster", to_int = TRUE, rm_nested = TRUE),
    cl4 = list(optional = TRUE, label = "fourth cluster", to_int = TRUE, rm_nested = TRUE)
  )
  
  vcov_clust_setup$patterns = c("", "cl1", "cl1 + cl2", "cl1 + cl2 + cl3",
                                "c1 + cl2 + cl3 + cl4")

  # Other keywords => direct two-/three-/four-way clustering
  cl1 = list(guess_from = list(fixef = 1), label = "first cluster", 
             to_int = TRUE, rm_nested = TRUE)
  cl2 = list(guess_from = list(fixef = 2), label = "second cluster", 
             to_int = TRUE, rm_nested = TRUE)
  cl3 = list(guess_from = list(fixef = 3), label = "third cluster", 
             to_int = TRUE, rm_nested = TRUE)
  cl4 = list(guess_from = list(fixef = 4), label = "fourth cluster", 
             to_int = TRUE, rm_nested = TRUE)

  vcov_twoway_setup = list(name = "twoway", 
                           fun_name = "vcov_cluster_internal", 
                           vcov_label = "Clustered")
  vcov_twoway_setup$vars = list(cl1 = cl1, cl2 = cl2)
  vcov_twoway_setup$patterns = c("", "cl1 + cl2")

  vcov_threeway_setup = list(name = "threeway", 
                             fun_name = "vcov_cluster_internal", 
                             vcov_label = "Clustered")
  vcov_threeway_setup$vars = list(cl1 = cl1, cl2 = cl2, cl3 = cl3)
  vcov_threeway_setup$patterns = c("", "cl1 + cl2 + cl3")

  vcov_fourway_setup = list(name = "fourway", 
                            fun_name = "vcov_cluster_internal", 
                            vcov_label = "Clustered")
  vcov_fourway_setup$vars = list(cl1 = cl1, cl2 = cl2, cl3 = cl3, cl4 = cl4)
  vcov_fourway_setup$patterns = c("", "cl1 + cl2 + cl3 + cl4")


  #
  # newey_west
  #

  # The variables
  unit = list(guess_from = list(panel.id = 1), 
              label = "panel ID", to_int = TRUE, rm_nested = TRUE,
              optional = TRUE)
  time = list(guess_from = list(panel.id = 2), label = "time", rm_nested = TRUE)

  vcov_newey_west_setup = list(name = c("NW", "newey_west"),
                               fun_name = "vcov_newey_west_internal",
                               vcov_label = "Newey-West")

  vcov_newey_west_setup$vars = list(unit = unit, time = time)
  vcov_newey_west_setup$arg_main = "lag"
  vcov_newey_west_setup$rdname = "vcov_hac"
  vcov_newey_west_setup$patterns = c("", "time", "unit + time")


  #
  # driscoll_kraay
  #

  vcov_driscoll_kraay_setup = list(name = c("DK", "driscoll_kraay"),
                                   fun_name = "vcov_driscoll_kraay_internal",
                                   vcov_label = "Driscoll-Kraay")

  vcov_driscoll_kraay_setup$vars = list(time = time)
  vcov_driscoll_kraay_setup$arg_main = "lag"
  vcov_driscoll_kraay_setup$rdname = "vcov_hac"
  vcov_driscoll_kraay_setup$patterns = c("", "time")


  #
  # conley
  #

  # The variables
  lat = list(guess_from = list(regex = c("^lat(itude)?$", "^lat_.+")),
             label = "latitude",
             expected_type = "numeric vector",
             rm_nested = TRUE)

  lng = list(guess_from = list(regex = c("^(lon|lng|long|longitude)$", "^(lng|lon|long)_.+")),
             label = "longitude",
             expected_type = "numeric vector",
             rm_nested = TRUE)

  vcov_conley_setup = list(name = "conley", 
                           fun_name = "vcov_conley_internal", 
                           vcov_label = "Conley")
  vcov_conley_setup$vars = list(lat = lat, lng = lng)
  vcov_conley_setup$arg_main = c("cutoff", "pixel", "distance")
  vcov_conley_setup$rdname = "vcov_conley"
  vcov_conley_setup$patterns = c("", "lat + lng")


  #
  # conley hac
  #

  vcov_conley_hac_setup = list(name = c("conley_hac", "hac_conley"), 
                               fun_name = "vcov_conley_hac_internal", 
                               vcov_label = "Conley-HAC")
  # The variables (already defined earlier)
  unit_conleyHAC = unit
  unit_conleyHAC$optional = NULL
  vcov_conley_hac_setup$vars = list(lat = lat, lng = lng, unit = unit_conleyHAC, time = time)
  vcov_conley_hac_setup$arg_main = c("cutoff", "pixel", "distance", "lag")
  vcov_conley_hac_setup$patterns = c("", "lat + lng", "time", "lat + lng + unit + time")

  #
  # Saving all the vcov possibilities
  #

  all_vcov = list(vcov_iid_setup,
                  vcov_hetero_setup,
                  vcov_clust_setup,
                  vcov_twoway_setup,
                  vcov_threeway_setup,
                  vcov_fourway_setup,
                  vcov_newey_west_setup,
                  vcov_driscoll_kraay_setup,
                  vcov_conley_setup)
  # vcov_conley_hac_setup)

  options(fixest_vcov_builtin = all_vcov)

}



####
#### Internal ####
####




vcovClust = function (cluster, myBread, scores, adj = FALSE, do.unclass = TRUE, 
                      sandwich = TRUE, nthreads = 1){
  # Internal function: no need for controls, they come beforehand
  # - cluster: the vector of dummies
  # - myBread: original vcov
  # - scores
  # Note: if length(unique(cluster)) == n (i.e. White correction), then the adj are such that vcovClust is equivalent to vcovHC(res, type="HC1")
  # Source: http://cameron.econ.ucdavis.edu/research/Cameron_Miller_JHR_2015_February.pdf
  #         Cameron & Miller -- A Practitioner's Guide to Cluster-Robust Inference

  n = NROW(scores)

  # Control for cluster type
  if(do.unclass){
    cluster = quickUnclassFactor(cluster)
  }

  Q = max(cluster)
  RightScores = cpp_tapply_sum(Q, scores, cluster)

  # Finite sample correction:
  if(adj){
    adj_value = Q / (Q - 1)
  } else {
    adj_value = 1
  }

  if(!sandwich){
    res = cpp_crossprod(RightScores, 1, nthreads) * adj_value
    return(res)
  }

  xy = cpp_matprod(RightScores, myBread, nthreads)
  res = cpp_crossprod(xy, 1, nthreads) * adj_value
  res
}


vcov_iid_internal = function(bread, ...){
  bread
}

vcov_hetero_internal = function(bread, scores, sandwich, ssc, nthreads, ...){

  if(!sandwich){
    vcov_noAdj = cpp_crossprod(scores, 1, nthreads)
  } else {
    # we make a n/(n-1) adjustment to match vcovHC(type = "HC1")

    n = nrow(scores)
    adj = ifelse(ssc$cluster.adj, n / (n-1), 1)

    vcov_noAdj = cpp_crossprod(cpp_matprod(scores, bread, nthreads), 1, nthreads) * adj
  }

  vcov_noAdj
}


vcov_cluster_internal = function(bread, scores, vars, ssc, sandwich, nthreads, var_names_all, ...){

  # aliasing to add (a bit of) clarity
  cluster = vars
  nway = length(cluster)

  # initialization
  vcov = bread * 0
  meat = 0

  for(i in 1:nway){

    myComb = combn(nway, i)

    power = floor(1 + log10(sapply(cluster, max)))

    for(j in 1:ncol(myComb)){

      if(i == 1){
        index = cluster[[myComb[, j]]]

      } else if(i > 1){

        vars = myComb[, j]

        if(sum(power[vars]) > 14){
          my_clusters = cluster[vars]

          order_index = do.call(order, my_clusters)
          index = cpp_combine_clusters(my_clusters, order_index)
        } else {
          # quicker, but limited by the precision of integers
          index = cluster[[vars[1]]]
          for(k in 2:length(vars)){
            index = index + cluster[[vars[k]]]*10**sum(power[vars[1:(k-1)]])
          }
        }

        index = quickUnclassFactor(index)

      }

      vcov = vcov + (-1)**(i+1) * vcovClust(index, bread, scores,
                                            adj = ssc$cluster.adj && ssc$cluster.df == 
                                              "conventional",
                                            sandwich = sandwich, nthreads = nthreads)

    }
  }

  G_min = min(sapply(cluster, max))
  if(ssc$cluster.adj && ssc$cluster.df == "min"){
    vcov = vcov * G_min / (G_min - 1)
    attr(vcov, "G") = G_min
  }

  if(!sandwich){
    return(vcov)
  }

  # I know I duplicate the information, but they refer to two different things
  attr(vcov, "min_cluster_size") = G_min

  var_names_all = var_names_all[nchar(var_names_all) > 0]
  if(length(var_names_all) > 0){
    attr(vcov, "type_info") = sma(" ({' & 'c ? var_names_all})")
  } else {
    attr(vcov, "type") = switch(nway,
                                "1" = "Clustered",
                                "2" = "Two-way",
                                "3" = "Three-way",
                                "4" = "Four-way")
  }

  vcov
}


vcov_newey_west_internal = function(bread, scores, vars, ssc, sandwich, nthreads, lag = NULL, ...){
  # Function that computes Newey-West VCOV

  # Setting up

  unit = vars$unit
  time = to_integer(vars$time, sorted = TRUE)

  n_time = max(time)

  # The bartlett weights
  set_weights = function(lag){
    res = seq(1, 0, by = -(1/(lag + 1)))
    # we must halve the first weight (we'll add the transpose internally)
    res[1] = 0.5
    res
  }

  is_panel = !is.null(unit)
  if(is_panel){

    # Lag: simple rule of thumb
    if(missnull(lag)){
      lag = floor(n_time^(1/4))
    }

    w = set_weights(lag)

    my_order = order(time, unit)
    time_ro = time[my_order]
    unit_ro = unit[my_order]

    dup = cpp_find_duplicates(unit_ro, time_ro)

    if(dup$n_dup > 0){
      stop("In vcov.fixest, Newey-West VCOV cannot be computed: there are (unit x time) duplicates. You have to sort that out first, eg by creating new units free of duplicates or dropping duplicates. Or you can use a Driscoll-Kraay VCOV for which duplicates does not matter.", call. = FALSE)
    }

    scores_ro = scores[my_order, , drop = FALSE]

    n_unit = max(unit_ro)

    meat = cpp_newey_west_panel(scores_ro, w, unit_ro, n_unit,
                                time_ro, n_time, nthreads)

  } else {

    if(max(time) < length(time)){
      stop("In vcov.fixest, Newey-West VCOV cannot be computed: there are time duplicates. You may provide a panel identifier (if relevant) to sort that out. Or you can use a Driscoll-Kraay VCOV for which duplicates does not matter.", call. = FALSE)
    }

    my_order = order(time)
    time_ro = time[my_order]
    scores_ro = scores[my_order, , drop = FALSE]

    # Lag: the default relies on bwNeweyWest
    if(missnull(lag)){

      # Later => feed in directly the matrix when the new version of sandwich is ready

      # we drop the intercept (except if there's only the intercept)
      is_intercept = (colnames(scores_ro) == "(Intercept)") & (ncol(scores_ro) > 1)
      if(any(is_intercept)){
        x = structure(list(scores = scores_ro[, !is_intercept, drop = FALSE]), class = "fixest")
      } else {
        x = structure(list(scores = scores_ro), class = "fixest")
      }

      lag = sandwich::bwNeweyWest(x, weights = 1)
      lag = floor(lag)
    }

    w = set_weights(lag)

    meat = cpp_newey_west(scores_ro, w, nthreads)
  }

  if(sandwich){
    vcov = prepare_sandwich(bread, meat, nthreads)
  } else {
    vcov = meat
  }

  if(ssc$cluster.adj){
    vcov = vcov * n_time / (n_time - 1)
  }

  attr(vcov, "G") = n_time
  attr(vcov, "min_cluster_size") = n_time

  attr(vcov, "type_info") = paste0(" (L=", lag, ")")

  vcov
}


vcov_driscoll_kraay_internal = function(bread, scores, vars, ssc, sandwich, 
                                        nthreads, lag = NULL, ...){
  # Function that computes Driscoll-Kraay VCOV

  # Setting up
  time = to_integer(vars$time, sorted = TRUE)

  n_time = max(time)

  # Lag: simple rule of thumb
  if(missnull(lag)){
    lag = floor(n_time^(1/4))
  }

  w = seq(1, 0, by = -(1/(lag + 1)))
  # we halve the first weight (since we add the transpose in the internal code)
  w[1] = 0.5

  my_order = order(time)
  time_ro = time[my_order]
  scores_ro = scores[my_order, , drop = FALSE]

  meat = cpp_driscoll_kraay(scores_ro, w, time_ro, n_time, nthreads)

  if(sandwich){
    vcov = prepare_sandwich(bread, meat, nthreads)
  } else {
    vcov = meat
  }

  if(ssc$cluster.adj){
    vcov = vcov * n_time / (n_time - 1)
  }

  attr(vcov, "G") = n_time
  attr(vcov, "min_cluster_size") = n_time

  attr(vcov, "type_info") = paste0(" (L=", lag, ")")

  vcov
}

vcov_conley_internal = function(bread, scores, vars, sandwich, nthreads,
                                cutoff = NULL, pixel = 0, distance = "triangular", ...){

  lon = vars$lng
  lat = vars$lat

  #
  # START :: checks

  # lon
  lon_range = range(lon)
  if(lon_range[1] < -360 || lon_range[2] > 360 || diff(lon_range) > 360){
    stop("In vcov.fixest, Conley VCOV cannot be computed: the longitude is outside the [-180, 180], [0, 360] or [-360, 0] range (current range is [", fsignif(lon_range[1]), ", ", fsignif(lon_range[2]), "]).", call. = FALSE)
  }

  # Later:
  # - I think that I really don't need to rescale but I should check first in the cpp code
  # that I don't use the -180/180 bounds
  if(lon_range[2] > 180){
    lon = lon - 180
  } else if(lon_range[1] < -180){
    lon = lon + 180
  }

  # lat
  lat_range = range(lat)
  if(lat_range[1] < -180 || lat_range[2] > 180 || diff(lat_range) > 180){
    stop("In vcov.fixest, Conley VCOV cannot be computed: the latitude is outside the [-90, 90], [0, 180] or [-180, 0] range (current range is [", fsignif(lat_range[1]), ", ", fsignif(lat_range[2]), "]).", call. = FALSE)
  }

  if(lat_range[2] > 90){
    # we normalize
    lat = lat - 90
  } else if(lat_range[1] < -90){
    lat = lat + 90
  }

  # cutoff
  cutoff = check_set_cutoff(cutoff)

  if(is.null(cutoff)){
    cutoff = cutoff_deduce(lat, lon)
  }

  # pixel
  check_value(pixel, "numeric scalar GE{0}",
              .prefix = "In vcov.fixest, Conley VCOV cannot be computed: the argument 'pixel'")

  # distance
  check_set_value(distance, "match(triangular, spherical)")
  distance = switch(distance,
                    spherical = 1,
                    triangular = 2)

  #   END :: checks
  #

  metric = attr(cutoff, "metric")
  if(metric == "mi"){
    pixel = pixel * 1.60934
  }

  if(pixel > 0){
    pixel_lat = pixel / 111
    lat_cell = ((lat + 90) %/% pixel_lat) * pixel_lat - 90

    pixel_lon = pixel / (111 * cos(lat * pi / 180))
    lon_cell = ((lon + 180) %/% pixel_lon) * pixel_lon - 180
  } else {
    lat_cell = lat
    lon_cell = lon
  }

  deg2rad = function(x) x / 180 * pi

  id_full = to_integer(lat_cell, lon_cell, sorted = TRUE, add_items = TRUE, 
                       items.list = TRUE, multi.df = TRUE)

  lon_ro = deg2rad(id_full$items$lon_cell)
  lat_ro = deg2rad(id_full$items$lat_cell)

  n_cases = length(lon_ro)
  scores_ro = cpp_tapply_sum(n_cases, scores, id_full$x)

  meat = cpp_vcov_conley(scores_ro, lon_ro, lat_ro, distance = distance, 
                         cutoff = cutoff, nthreads = nthreads)

  if(sandwich){
    vcov = prepare_sandwich(bread, meat, nthreads)
  } else {
    vcov = meat
  }

  scale = if(metric == "km") 1 else 1 / 1.60934

  attr(vcov, "type_info") = paste0(" (", cutoff * scale, metric, ")")

  vcov
}

####
#### Utilities ####
####


oldargs_to_vcov = function(se, cluster, vcov, .vcov = NULL){
  # Transforms se + cluster + .vcov into a vcov call

  set_up(1)

  if(missnull(se) && missnull(cluster) && missnull(.vcov)){
    if(!missnull(vcov)){
      return(vcov)
    } else {
      return(NULL)
    }
  }

  if(!missnull(vcov)){

    if(is_function_in_it(vcov)){
      return(vcov)

    } else {
      id = c(!missnull(se), !missnull(cluster), !missnull(.vcov))
      msg = c("se", "cluster", ".vcov")[id]
      stop_up("You cannot use the argument 'vcov' in combination with {enum.q.or ? msg}.")
    }
  }

  if(!missnull(.vcov)){

    if(!is_function_in_it(.vcov)){
      if(!missnull(se) || !missnull(cluster)){
        id = c(!missnull(se), !missnull(cluster))
        msg = c("se", "cluster")[id]
        stop_up("You cannot use the argument '.vcov' in combination with {enum.q.or ? msg}.")
      }
    }

    check_arg(.vcov, "matrix | function")

    vcov = .vcov
    attr(vcov, "deparsed_arg") = fetch_arg_deparse(".vcov")

  } else {
    all_vcov = getOption("fixest_vcov_builtin")
    all_vcov_names = unlist(lapply(all_vcov, `[[`, "name"))
    all_vcov_names = all_vcov_names[nchar(all_vcov_names) > 0]

    if(missnull(se)) se = "cluster"
    check_set_value(se, "match", .choices = all_vcov_names, 
                    .prefix = "Argument 'se' (which has been replaced by arg. 'vcov')")

    if(missnull(cluster)){
      vcov = se

    } else if(!se %in% c("cluster", "twoway", "threeway", "frouway")){
      stop_up("The VCOV requested with the argument se = '", se, "' is not compatible with the use of the argument 'cluster' (i.e. you have to choose!).")

    } else {
      if(inherits(cluster, "formula")){
        vcov = cluster
      } else if(is.character(cluster) && length(cluster) <= 4){
        vcov = .xpd(lhs = "cluster", rhs = cluster)
      } else {
        # We create a vcov request
        vcov = vcov_cluster(cluster = cluster)
      }
    }

    assign("se", NULL, parent.frame())
    assign("cluster", NULL, parent.frame())
  }

  vcov
}

catch_fun_args = function(fun, dots, exclude_args = NULL, erase_args = FALSE, keep_dots = FALSE){

  arg_names = formalArgs(fun)

  if(any(exclude_args %in% arg_names)){
    fname = deparse(substitute(fun))
    pblm = intersect(exclude_args, arg_names)
    stop_up("When the argument {bq?fname} is a function, this function cannot contain any argument named {enum.q.or ? pblm} since {$(this name is;these names are)} reserved.")
  }

  # we keep only the variables in formalArgs
  if(keep_dots){
    arg_list = dots
  } else {
    vars = intersect(names(dots), arg_names)
    arg_list = list()
    if(length(vars) > 0){
      arg_list[vars] = dots[vars]
    }
  }

  # variables from the main function call
  sysOrigin = sys.parent()
  mc = match.call(sys.function(sysOrigin), sys.call(sysOrigin), expand.dots = FALSE)
  args_previous = setdiff(names(mc)[-1], c("", "..."))

  for(var in intersect(args_previous, arg_names)){
    arg_list[[var]] = eval(str2expression(var), parent.frame())
    # NOTA: shall I put to null these arguments? Since they are now used in
    # this function... I don't know... I think it's a good idea but there can be problems.
    # Let's do it then! I'm just adding the argument erase_args.
    #
    # I think it's a good idea bc it's super easy to do it here while it's cumbersome to
    # do it in the calling fun

    if(erase_args) assign(var, NULL, parent.frame())
  }

  arg_list
}


slide_args = function(x, ...){
  # if the first argument is implicitly provided
  # slides the arguments by one slot
  #
  # we assign directly in the parent frame

  sysOrigin = sys.parent()
  sc = sys.calls()[[sysOrigin]]
  mc = match.call(definition = sys.function(sysOrigin), call = sys.call(sysOrigin))

  dots = list(...)

  if(!"x" %in% names(sc)){
    if(!missing(x)){
      if(inherits(x, "fixest")){
        # OK, regular x
      } else {
        # => implicit specification
        # sliding

        implicit_args = setdiff(names(mc), c(names(sc), ""))
        remaining_args = setdiff(names(dots), names(sc))

        for(i in seq_along(implicit_args)){
          if(implicit_args[i] == "x"){
            value = x
          } else {
            value = dots[[implicit_args[i]]]
          }

          assign(remaining_args[i], value, parent.frame())
        }

        assign("x", NULL, parent.frame())

      }
    } else {
      assign("x", NULL, parent.frame())
    }
  } else {
    if(missing(x)){
      assign("x", NULL, parent.frame())
    } else {
      check_arg(x, "class(fixest)", .up = 1)
    }
  }
}


prepare_sandwich = function(bread, meat, nthreads = 1){
  cpp_matprod(cpp_matprod(bread, meat, nthreads), bread, nthreads)
}

check_set_cutoff = function(cutoff){

  set_up(1)

  if(is.null(cutoff)){
    return(NULL)
    # stop("In vcov.fixest, Conley VCOV cannot be computed: the argument 'cutoff' (the cutoff distance in km) must be provided.", call. = FALSE)
  }

  check_value(cutoff, "numeric scalar GE{0} | character scalar",
        .message = "In vcov.fixest, Conley VCOV cannot be computed: the argument 'cutoff' must be a numeric for kilometers, or a number in character form with the 'mi' suffix for miles (ex: 100 means 100km, '100mi' means 100 miles).")

  metric = attr(cutoff, "metric")
  if(is.null(metric)) metric = "km"

  if(is.character(cutoff)){
    if(!grepl("^[[:digit:]]+ ?((m|mi|mil|mile|miles)|(k|km))?$", cutoff)){
      stop("In vcov.fixest, Conley VCOV cannot be computed: the argument 'cutoff', equal to '", cutoff,  "' is not valid. It must be either a numeric scalar or a number in character form with the suffix 'mi' (ex: 100 means 100km, '100mi' means 100 miles).", call. = FALSE)
    }

    if(grepl("k", cutoff)){
      cutoff = as.numeric(gsub("k.*", "", cutoff))
    } else if(grepl("m", cutoff)){
      cutoff = as.numeric(gsub("m.*", "", cutoff)) * 1.60934
      metric = "mi"
    } else {
      cutoff = as.numeric(cutoff)
    }
  }

  attr(cutoff, "metric") = metric

  cutoff
}



gen_vcov_aliases = function(){
  # only for vcov functions having one or more main argument

  all_vcov = getOption("fixest_vcov_builtin")

  fun_core = '
#\' @rdname __RDNAME__
__FUN__ = function(__ARGS__){
  extra_args = list(__EXTRA__)
  vcov_request = list(vcov = "__VCOV__", extra_args = extra_args)
  class(vcov_request) = "fixest_vcov_request"
  vcov_request
}'


  text = c()
  all_fun_names = c()

  for(i in seq_along(all_vcov)){
    vcov_current = all_vcov[[i]]

    arg_main = vcov_current$arg_main
    if(!is.null(arg_main)){
      vcov_names_all = vcov_current$name

      vcov_name = vcov_names_all[1]
      rdname = vcov_current$rdname
      args = paste0(arg_main, " = NULL", collapse = ", ")
      extra = paste0(arg_main, " = ", arg_main, collapse = ", ")

      core = gsub("__RDNAME__", rdname, fun_core)
      core = gsub("__VCOV__", vcov_name, core)
      core = gsub("__ARGS__", args, core)
      core = gsub("__EXTRA__", extra, core)

      for(fname in vcov_names_all){
        text = c(text, gsub("__FUN__", fname, core, fixed = TRUE))
        all_fun_names = c(all_fun_names, fname)
      }
    }
  }

  # Writing the functions
  intro = c("# Do not edit by hand\n# => aliases some VCOV functions\n\n\n")

  text = c(intro, text)

  update_file("./R/alias_VCOV.R", text)

  # We also add the exports to the name space
  header = "# Auto-exports::vcov_aliases"
  fun_line = paste0("export(", paste0(all_fun_names, collapse = ", "), ")")

  NAMESPACE = readLines("NAMESPACE")

  if(header %in% NAMESPACE){
    # update
    qui = which(NAMESPACE == header)
    NAMESPACE[qui + 1] = fun_line
  } else {
    # creation
    NAMESPACE = c(NAMESPACE, "\n\n", header, fun_line, "\n\n")
  }

  update_file("NAMESPACE", NAMESPACE)


}


great_circle_distance = function (lat1, long1, lat2, long2){
  deg2rad = function(x) x / 180 * pi

  lat1 = deg2rad(lat1)
  lat2 = deg2rad(lat2)
  long1 = deg2rad(long1)
  long2 = deg2rad(long2)

  R = 6371
  delta.long = (long2 - long1)
  delta.lat = (lat2 - lat1)
  a = sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2
  c = 2 * asin(pmin(1, sqrt(a)))
  d = R * c

  return(d)
}


cutoff_deduce = function(lat, lon){
  # To deduce the cutoff, we apply a basic rule of thumb

  quoi = unique(data.frame(lat, lon))

  order_latlon = with(quoi, order(lat, lon))
  order_lonlat = with(quoi, order(lon, lat))

  quoi_latlon = quoi[order_latlon, ]
  quoi_lonlat = quoi[order_lonlat, ]

  N = nrow(quoi)
  keep = 1:(N - 3)

  # To increase stability, we take the closest among the 3 closest lat or long
  d_latlon_1 = with(quoi_latlon, great_circle_distance(lat[-N], lon[-N], lat[-(1:1)], lon[-(1:1)]))
  d_latlon_2 = with(quoi_latlon, great_circle_distance(lat[-((N - 1):N)], lon[-((N - 1):N)], lat[-(1:2)], lon[-(1:2)]))
  d_latlon_3 = with(quoi_latlon, great_circle_distance(lat[-((N - 2):N)], lon[-((N - 2):N)], lat[-(1:3)], lon[-(1:3)]))

  d_lonlat_1 = with(quoi_lonlat, great_circle_distance(lat[-N], lon[-N], lat[-1], lon[-1]))
  d_lonlat_2 = with(quoi_lonlat, great_circle_distance(lat[-((N - 1):N)], lon[-((N - 1):N)], lat[-(1:2)], lon[-(1:2)]))
  d_lonlat_3 = with(quoi_lonlat, great_circle_distance(lat[-((N - 2):N)], lon[-((N - 2):N)], lat[-(1:3)], lon[-(1:3)]))

  d_latlon = pmin(d_latlon_1[keep], d_latlon_2[keep], d_latlon_3[keep])
  d_lonlat = pmin(d_lonlat_1[keep], d_lonlat_2[keep], d_lonlat_3[keep])

  # We take twice the median distance => corresponds to two units radius
  # roughly 50% of units have at least around 8 neighbors (=> we ensure a minimum)
  cutoff = (median(d_latlon) + median(d_lonlat))
  # we round it to make it nicer and more robust
  m = floor(log10(cutoff)) - 1
  if(cutoff > 20) m = max(m, 1)
  cutoff = (cutoff %/% 10**m) * 10**m

  attr(cutoff, "metric") = "km"

  cutoff
}

is_function_in_it = function(x){
  if(is.function(x)){
    return(TRUE)
  } else if(is.list(x) && !inherits(x, "fixest_vcov_request") && length(x) > 0 && is.function(x[[1]])){
    return(TRUE)
  }
  return(FALSE)
}

####
#### SET / GET ========== ####
####



#' @rdname ssc
#'
#' @param ssc.type An object of class `ssc.type` obtained with the function [`ssc`].
setFixest_ssc = function(ssc.type = ssc()){

  if(!"ssc.type" %in% class(ssc.type)){
    stop("The argument 'ssc' must be an object created by the function ssc().")
  }

  options("fixest_ssc" = ssc.type)
}

#' @rdname ssc
getFixest_ssc = function(){

  ssc = getOption("fixest_ssc")
  if(!"ssc.type" %in% class(ssc)){
    stop("The value of getOption(\"fixest_ssc\") is currently not legal. Please use function setFixest_dict to set it to an appropriate value.")
  }

  ssc
}


#' Sets the default type of standard errors to be used
#'
#' This functions defines or extracts the default type of standard-errors to computed in 
#' `fixest` [`summary`][fixest::summary.fixest], and [`vcov`][fixest::vcov.fixest].
#'
#' @param no_FE Character scalar equal to either: `"iid"` (default), or `"hetero"`. The type 
#' of standard-errors to use by default for estimations without fixed-effects.
#' @param one_FE Character scalar equal to either: `"iid"`, `"hetero"`, or `"cluster"` (default). 
#' The type of standard-errors to use by default for estimations with *one* fixed-effect.
#' @param two_FE Character scalar equal to either: `"iid"`, `"hetero"`, `"cluster"` (default), or 
#' `"twoway"`. The type of standard-errors to use by default for estimations with *two or more* 
#' fixed-effects.
#' @param panel Character scalar equal to either: `"iid"`, `"hetero"`, `"cluster"` (default), or 
#' `"driscoll_kraaay"`. The type of standard-errors to use by default for estimations with the 
#' argument `panel.id` set up. Note that panel has precedence over the presence of fixed-effects.
#' @param all Character scalar equal to either: `"iid"`, or `"hetero"`. By default is is NULL. If 
#' provided, it sets all the SEs to that value.
#' @param reset Logical, default is `FALSE`. Whether to reset to the default values.
#'
#' @return
#' The function `getFixest_vcov()` returns a list with three elements containing the default for 
#' estimations i) without, ii) with one, or iii) with two or more fixed-effects.
#' 
#'
#' @examples
#'
#' # By default:
#' # - no fixed-effect (FE): standard
#' # - one or more FEs: cluster
#' # - panel: cluster on panel id
#'
#' data(base_did)
#' est_no_FE  = feols(y ~ x1, base_did)
#' est_one_FE = feols(y ~ x1 | id, base_did)
#' est_two_FE = feols(y ~ x1 | id + period, base_did)
#' est_panel = feols(y ~ x1 | id + period, base_did, panel.id = ~id + period)
#'
#' etable(est_no_FE, est_one_FE, est_two_FE)
#'
#' # Changing the default standard-errors
#' setFixest_vcov(no_FE = "hetero", one_FE = "iid",
#'                two_FE = "twoway", panel = "drisc")
#' etable(est_no_FE, est_one_FE, est_two_FE, est_panel)
#'
#' # Resetting the defaults
#' setFixest_vcov(reset = TRUE)
#'
#'
setFixest_vcov = function(no_FE = "iid", one_FE = "cluster", two_FE = "cluster",
                          panel = "cluster", all = NULL, reset = FALSE){

  # NOTE:
  # The default values should ALWAYS be working.
  # That's why I don't allow conley SEs nor NW SEs
  # => they can be not working at times

  check_set_arg(no_FE,  "match(iid, hetero)")
  check_set_arg(one_FE, "match(iid, hetero, cluster)")
  check_set_arg(two_FE, "match(iid, hetero, cluster, twoway)")
  check_set_arg(panel,  "match(iid, hetero, cluster, DK, driscoll_kraay)")
  check_set_arg(all, "NULL match(iid, hetero)")
  check_set_arg(reset, "logical scalar")

  opts = getOption("fixest_vcov_default")
  if(is.null(opts) || !is.list(opts) || reset){
    opts = list(no_FE = "iid", one_FE = "cluster", two_FE = "cluster", panel = "cluster")
  }

  if(!is.null(all)){
    opts$no_FE = opts$one_FE = opts$two_FE = opts$panel = all
  }


  args = intersect(c("no_FE", "one_FE", "two_FE", "panel"), names(match.call()))

  for(a in args){
    opts[[a]] = eval(as.name(a))
  }

  options(fixest_vcov_default = opts)

}

#' @rdname setFixest_vcov
getFixest_vcov = function(){

  vcov_default = getOption("fixest_vcov_default")

  if(is.null(vcov_default)){
    vcov_default = list(no_FE = "iid", one_FE = "cluster", two_FE = "cluster", panel = "cluster")
    options(fixest_vcov_default = vcov_default)
    return(vcov_default)
  }

  vcov_default
}

Try the fixest package in your browser

Any scripts or data that you put into this service are public.

fixest documentation built on June 22, 2024, 9:12 a.m.