R/miscfuns.R

Defines functions collinearity

Documented in collinearity

#----------------------------------------------#
# Author: Laurent Berge
# Date creation: Sat Apr 23 15:35:53 2022
# ~: A few user-level + many internal funs
#----------------------------------------------#



#' Collinearity diagnostics for `fixest` objects
#'
#' In some occasions, the optimization algorithm of [`femlm`] may fail to converge, or 
#' the variance-covariance matrix may not be available. The most common reason of why 
#' this happens is collinearity among variables. This function helps to find out which 
#' set of variables is problematic.
#'
#'
#' @param x A `fixest` object obtained from, e.g. functions [`femlm`], [`feols`] or [`feglm`].
#' @param verbose An integer. If higher than or equal to 1, then a note is prompted at 
#' each step of the algorithm. By default `verbose = 0` for small problems 
#' and to 1 for large problems.
#'
#' @details
#' This function tests: 1) collinearity with the fixed-effect variables, 
#' 2) perfect multi-collinearity between the variables, 3) perfect multi-collinearity 
#' between several variables and the fixed-effects, and 4) identification issues 
#' when there are non-linear in parameters parts.
#'
#' @return
#' It returns a text message with the identified diagnostics.
#'
#' @author
#' Laurent Berge
#'
#' @examples
#'
#' # Creating an example data base:
#' set.seed(1)
#' fe_1 = sample(3, 100, TRUE)
#' fe_2 = sample(20, 100, TRUE)
#' x = rnorm(100, fe_1)**2
#' y = rnorm(100, fe_2)**2
#' z = rnorm(100, 3)**2
#' dep = rpois(100, x*y*z)
#' base = data.frame(fe_1, fe_2, x, y, z, dep)
#'
#' # creating collinearity problems:
#' base$v1 = base$v2 = base$v3 = base$v4 = 0
#' base$v1[base$fe_1 == 1] = 1
#' base$v2[base$fe_1 == 2] = 1
#' base$v3[base$fe_1 == 3] = 1
#' base$v4[base$fe_2 == 1] = 1
#'
#' # Estimations:
#'
#' # Collinearity with the fixed-effects:
#' res_1 = femlm(dep ~ log(x) + v1 + v2 + v4 | fe_1 + fe_2, base)
#' collinearity(res_1)
#'
#' # => collinearity with the first fixed-effect identified, we drop v1 and v2
#' res_1bis = femlm(dep ~ log(x) + v4 | fe_1 + fe_2, base)
#' collinearity(res_1bis)
#'
#' # Multi-Collinearity:
#' res_2 =  femlm(dep ~ log(x) + v1 + v2 + v3 + v4, base)
#' collinearity(res_2)
#'
#'
collinearity = function(x, verbose){
  # x: fixest estimation
  
  # todo: 
  # - input: fomrula + data to check collinearity in any kind of setup
  # - output: all the sets of problems and not only the first one
  #           find a way to format the output properly (should be informative)
  # - arg: focus: to focus on a single variable
  # - arg: collin.tol

  if(!inherits(x, "fixest")){
    stop("Argument `x` must be a fixest object.")
  }

  # I) (linear) collinearity with fixed-effects
  # II) (linear) multi collinearity
  # III) (non-linear) overidentification

  # flags
  isFixef = !is.null(x$fixef_vars)
  isFE = FALSE
  isSlope = FALSE
  if(isFixef){
    isSlope = !is.null(x$fixef_terms)
    if(isSlope){
      fixef_terms = x$fixef_terms
      slope_flag = grepl("\\[", fixef_terms)
      isFE = any(!slope_flag)
    } else {
      isFE = TRUE
    }
  }

  linear_fml = fml_split(formula(x, "linear"), 2, split.lhs = TRUE)

  rhs_fml = fml_split(formula(x, "linear"), 1)
  linear.varnames = all.vars(rhs_fml[[3]])

  isLinear = length(linear.varnames) > 0

  NL_fml = x$NL.fml
  isNL = !is.null(NL_fml)
  coef = x$coefficients

  # Getting the data
  data = fetch_data(x, "To apply function `collinearity`, ")

  if(is.matrix(data)){
    data = as.data.frame(data)
  } else {
    class(data) = "data.frame"
  }

  if(isFE){
    linear_fml = update(linear_fml, ~ . + 1)
  }

  # Panel setup
  panel__meta__info = set_panel_meta_info(x, data)

  if(isLinear || isFixef || "(Intercept)" %in% names(coef)){
    linear.matrix = fixest_model_matrix(rhs_fml, data)
  }

  for(i in seq_along(x$obs_selection)){
    linear.matrix = linear.matrix[x$obs_selection[[i]], , drop = FALSE]
  }

  if(isLinear){
    # We do that to drop interaction variables that should not be there any more
    # if factors with only NA values
    varkeep = intersect(c(names(x$coefficients), x$collin.var), colnames(linear.matrix))
    if(length(varkeep) < ncol(linear.matrix)){
      linear.matrix = linear.matrix[, varkeep, drop = FALSE]
    }
  }


  if(isLinear){
    # We center and scale to have something comparable across data sets
    first_row = linear.matrix[1, ]
    linear.matrix = scale(linear.matrix, center = FALSE, scale = TRUE)
    # The constants => we set it to 1
    constant_id = apply(linear.matrix, 2, anyNA)
    constant_value = first_row[constant_id]
    linear.matrix[is.na(linear.matrix)] = 1
    linear_mat_noIntercept = linear.matrix[, -1, drop = FALSE]
  }

  n_obs = nrow(data)
  Q = length(x$fixef_id)

  # information display
  ccat = function(...) NULL
  if(missing(verbose)){
    verbose = FALSE
    # verbose only if task is long
    if(100 * nrow(linear.matrix) * (ncol(linear.matrix)**2 * Q**2) >= 1e9){
      verbose = TRUE
    }
  }
  if(verbose) ccat = catma

  #
  # 0) Data preparation (FE/slopes)
  #

  # Variables for FE / Slopes
  if(isSlope){
    fixef_terms = x$fixef_terms
    terms_full = extract_fe_slope(fixef_terms)
    fixef_vars = terms_full$fixef_vars
    slope_fe = terms_full$slope_fe
    fe_all = terms_full$fe_all
    slope_vars = terms_full$slope_vars
    slope_terms = terms_full$slope_terms

    # dictionary mapping fe var names to the ids of id_dummies_vect
    dict_fe = 1:Q
    names(dict_fe) = x$fixef_vars
    slope_vars_unik = unique(slope_vars)

    if(any(!slope_vars_unik %in% names(data))){
      var_pblm = setdiff(slope_vars_unik, names(data))
      stopi("To check collinearity, we need to fetch some variables in the original dataset (",
           deparse_long(x$call$data), "). ",
           "However, the variable {$s, enum, is?var_pblm} not present in the original dataset any more.")
    }

    slope_var_list = list()
    for(i in 1:length(slope_vars_unik)){
      variable = all.vars(str2lang(slope_vars_unik[i]))

      # as.numeric => we'll use cpp so required
      svar = as.numeric(as.vector(eval(str2lang(slope_vars_unik[i]), data)))
      if(length(svar) == 1) svar = rep(svar, n_obs)

      slope_var_list[[slope_vars_unik[i]]] = svar
    }

    Q_slope = length(slope_fe)
    slope_fe_id = x$fixef_id[slope_fe]
    slope_var_all = slope_var_list[slope_vars]

    Q_fe = length(fixef_vars)
    fe_id = x$fixef_id[fixef_vars]

  } else if(isFE){
    Q_fe = length(x$fixef_id)
    fe_id = x$fixef_id
  }


  #
  # I) collinearity with FEs
  #

  ccat("Checking Collinearity: \n")

  #
  # Variables are constant or equal to 0
  #

  if(isLinear){

    if(isFE){
      if(any(constant_id[-1])){
        # var_problem = colnames(linear_mat_noIntercept)[is_const]
        var_problem = colnames(linear_mat_noIntercept)[constant_id[-1]]
        message = sma("The Variable{$s, enum.q, is ? var_problem} constant, thus collinear with the fixed-effects.")
        mema(message)
        return(invisible(message))
      }

    } else {

      isIntercept = attr(terms(x$fml),"intercept")

      if(isIntercept && any(constant_id[-1])){
        var_problem = colnames(linear_mat_noIntercept)[constant_id[-1]]
        message = sma("The variable{$s, enum.q, is ? var_problem} constant, thus collinear with the intercept.")
        mema(message)
        return(invisible(message))
      }

      if(any(constant_value == 0)){
        # constant and equal to 0
        var_problem = colnames(linear.matrix)[first_row == 0 & constant_id]
        message = sma("The variable{$s, enum.q, is ? var_problem} constant and equal to 0.")

        mema(message)
        return(invisible(message))
      }
    }
  }

  if(isFE && isLinear){
    ccat("- simple with fixed-effects:")
    # We project each variable onto the cluster subspace

    cluster = fe_id
    Q_fe = length(cluster)
    for(q in 1:Q_fe){
      ccat(".")
      dum = cluster[[q]]
      k = max(dum)

      value = cpp_tapply_sum(Q = k, x = linear_mat_noIntercept, dum = dum)
      nb_per_cluster = cpp_table(Q = k, dum = dum)

      # residuals of the linear projection on the cluster space
      residuals = linear_mat_noIntercept - (value/nb_per_cluster)[dum, ]

      max_residuals = apply(abs(residuals), 2, max)

      if(any(max_residuals < 1e-6)){
        ccat("\n")
        varnames = colnames(linear_mat_noIntercept)
        collin_var = varnames[max_residuals < 1e-6]
        message = sma("The variable{$s, enum.q, is ? collin_var} collinear with the fixed-effects `", names(cluster)[q], "`.")

        mema(message)
        return(invisible(message))

      }

    }
    ccat("OK\n")
  }

  if(isSlope && isLinear){
    ccat("- simple with variables with varying slopes:")

    for(q in 1:Q_slope){
      ccat(".")
      dum = fe_id[[q]]
      my_var = slope_var_all[[q]]
      k = max(dum)
      value = cpp_tapply_sum(Q = k, x = linear_mat_noIntercept*my_var, dum = dum)

      denom = as.vector(tapply(my_var**2, dum, sum))

      # residuals of the linear projection on the cluster space
      residuals = linear_mat_noIntercept - (value/denom)[dum, ]*my_var

      max_residuals = apply(abs(residuals), 2, max)

      if(any(max_residuals < 1e-6)){
        ccat("\n")
        varnames = colnames(linear_mat_noIntercept)
        collin_var = varnames[max_residuals < 1e-6]

        message = sma("The variable{$s, enum.q, is ? collin_var} collinear with variables with varying slope `", slope_vars[q], "` (on `", slope_fe[q], "`).")

        mema(message)
        return(invisible(message))

      }

    }
    ccat("OK\n")
  }

  #
  # II) perfect multicollinearity
  #

  name2change = grepl("[^[:alnum:]\\._]", colnames(linear.matrix))
  dict_name = colnames(linear.matrix)[!grepl("(Intercept)", colnames(linear.matrix))]
  if(any(name2change)){
    linbase = as.data.frame(linear.matrix)
    names(linbase)[name2change] = gsub("[^[:alnum:]\\._]", "_", names(linbase)[name2change])
  } else {
    linbase = as.data.frame(linear.matrix)
  }
  linearVars = names(linbase)[!grepl("Intercept", names(linbase))]
  names(dict_name) = linearVars
  dict_name["_Intercept_"] = "(Intercept)"

  # for multicol without cluster
  mat_base = as.matrix(linbase)

  # we add possibly missing variables
  varmiss = setdiff(all.vars(linear_fml), names(linbase))
  for(v in varmiss) linbase[[v]] = data[[v]]

  if(isLinear && length(linearVars) >= 2){
    ccat("- multiple:")
    for(v in linearVars){
      ccat(".")

      i = which(colnames(mat_base) == v)
      res = ols_fit(y = mat_base[, i], X = mat_base[, -i, drop = FALSE], 
                    w = 1, collin.tol = 1e-10, nthreads = 1)

      max_residuals = max(abs(res$residuals))

      if(max_residuals < 1e-4){
        ccat("\n")
        coef_lm = res$coefficients
        vars = colnames(mat_base)[-i]
        collin_var = vars[!is.na(coef_lm) & abs(coef_lm) > 1e-6]
        message = sma("The variable {bq?dict_name[v]} is collinear with the variable{$s, enum.q ? dict_name[collin_var]}.")

        mema(message)
        return(invisible(message))
      }
    }
    ccat("OK\n")
  }

  #
  # II.b) perfect multicollinearity + fixed-effects
  #

  if(isLinear && length(linearVars) >= 2 && isFixef){
    ccat("- multiple with fixed-effects")

    dum_names = names(x$fixef_id)
    n_clust = length(dum_names)
    new_dum_names = paste0("DUM_", 1:n_clust)
    for(i in 1:n_clust){
      linbase[[paste0("DUM_", i)]] = x$fixef_id[[i]]
    }
    
    for(v in linearVars){
      ccat(".")
      fml2estimate = as.formula(paste0(v, "~", paste0(setdiff(linearVars, v), collapse = "+")))

      for(id_cluster in 1:n_clust){
        
        res = feols(fml2estimate, linbase, fixef = new_dum_names[id_cluster], warn = FALSE)

        max_residuals = max(abs(res$residuals))

        if(max_residuals < 1e-4){
          ccat("\n")
          coef_lm = coef(res)
          collin_var = names(coef_lm)[!is.na(coef_lm) & abs(coef_lm) > 1e-6]
          message = sma("The variable {bq?dict_name[v]} is collinear with variable{$s, enum.q ? dict_name[collin_var]}, together with the fixed-effects {bq ? dum_names[id_cluster]}.")

          mema(message)
          return(invisible(message))
        }
      }
    }
    ccat("OK\n")
  }

  #
  # III) NL problem
  #

  if(isNL){
    NL_vars = all.vars(NL_fml)
    varNotHere = setdiff(NL_vars, c(names(coef), names(data)))
    if(length(varNotHere) > 0){
      stopi("Some variables used to estimate the model (in the non-linear formula) are missing from the original data: {enum.q ? varNotHere}.")
    }

    var2send = intersect(NL_vars, names(data))
    env = new.env()
    for(var in var2send){
      assign(var, data[[var]], env)
    }
  }

  if(isNL && (length(coef) >= 2 || isFixef)){
    ccat(", in non-linear term:")
    if(isFixef){
      # we add the constant
      coef["CONSTANT"] = 1
      data$CONSTANT = 1
      linear.matrix = model.matrix(update(linear_fml, ~ . -1 + CONSTANT), data)
    }

    coef_name = names(coef)

    NL_coef = setdiff(all.vars(NL_fml), names(data))
    # L_coef = setdiff(coef_name, NL_coef)
    L_coef = colnames(linear.matrix)

    #
    # We compute mu:
    #

    mu = 0
    if(length(L_coef) > 0){
      mu = linear.matrix %*% coef[L_coef]
    }

    for(iter_coef in NL_coef){
      assign(iter_coef, coef[iter_coef], env)
    }

    # Evaluation of the NL part
    value_NL = eval(NL_fml[[2]], env)
    mu = mu + value_NL
    data$mu = mu

    if(var(mu) == 0){
      message = "Variance of the NL part is 0."
      mema(message)
      return(invisible(message))
    }

    #
    # The loop
    #

    for(var in coef_name){
      ccat(".")
      # we modify the coef of var, then we fix it:
      # if we can find a set of other parameters that give the same fit,
      # then there is an identification issue

      if(var %in% L_coef){
        # if linear coef: we modify the fml and add an offset
        if(length(L_coef) == 1){
          fml = mu ~ 0
        } else {
          fml = as.formula(paste0("mu ~ 0+", paste0(setdiff(L_coef, var), collapse = "+")))
        }

        # offset with the new value
        offset = as.formula(paste0("~", coef[var] + 1, "*", var))

        res = feNmlm(fml, data = data, family = "gaussian", NL.fml = NL_fml, 
                     NL.start = as.list(coef), offset = offset)

      } else {
        # NL case:
        # we modify both fml and NL.fml

        if(isFixef){
          fml = update(x$fml, mu ~ . - 1 + CONSTANT)
        } else {
          fml = update(x$fml, mu ~ .)
        }

        # the new formula with the new variable
        NL_fml_char = as.character(NL_fml)[2]
        NL_fml_new = as.formula(paste0("~", gsub(paste0("\\b", var, "\\b"), 1 + coef[var], NL_fml_char)))

        if(length(NL_coef) == 1){
          # there is no more parameter to estimate => offset
          res = femlm(fml, data = data, family = "gaussian", offset = NL_fml_new)
        } else {
          res = feNmlm(fml, data = data, family = "gaussian", NL.fml = NL_fml_new, 
                       NL.start = as.list(coef))
        }

      }

      sum_resids = sum(abs(res$residuals))
      if(sum_resids < 1e-4){
        coef_esti = coef(res)
        coef_diff = abs(coef_esti - coef[names(coef_esti)])
        collin_var = names(coef_diff)[coef_diff > 1e-3]
        message = paste0("Coefficients ", show_vars_limited_width(c(var, collin_var)), " are not uniquely identifed.")

        mema(message)
        return(invisible(message))
      }

    }
    ccat("OK\n")
  }

  message = "No visible collinearity problem. (Doesn't mean there's none!)"
  mema(message)
  return(invisible(message))

}



#' Treated and control sample descriptives
#'
#' This function shows the means and standard-deviations of several variables conditional on whether they are from the treated or the control group. The groups can further be split according to a pre/post variable. Results can be seamlessly be exported to Latex.
#'
#'
#' @param fml Either a formula of the type `var1 + ... + varN ~ treat` or 
#' `var1 + ... + varN ~ treat | post`. Either a data.frame/matrix containing 
#' all the variables for which the means are to be computed (they must be numeric of course).
#'  Both the treatment and the post variables must contain only exactly two values. 
#' You can use a point to select all the variables of the data set: `. ~ treat`.
#' @param base A data base containing all the variables in the formula `fml`.
#' @param treat_var Only if argument `fml` is *not* a formula. The vector identifying 
#' the treated and the control observations (the vector can be of any type but must 
#' contain only two possible values). Must be of the same length as the data.
#' @param post_var Only if argument `fml` is *not* a formula. The vector identifying 
#' the periods (pre/post) of the observations (the vector can be of any type but 
#' must contain only two possible values). The first value (in the sorted sense) 
#' of the vector is taken as the pre period. Must be of the same length as the data.
#' @param treat_first Which value of the 'treatment' vector should appear on the left? 
#' By default the max value appears first (e.g. if the treatment variable is a 0/1 vector, 
#' 1 appears first).
#' @param tex Should the result be displayed in Latex? Default is `FALSE`. Automatically 
#' set to `TRUE` if the table is to be saved in a file using the argument `file`.
#' @param treat_dict A character vector of length two. What are the names of the treated 
#' and the control? This should be a dictionary: e.g. `c("1"="Treated", "0" = "Control")`.
#' @param dict A named character vector. A dictionary between the variables names and an alias. 
#' For instance `dict=c("x"="Inflation Rate")` would replace the variable name `x` 
#' by \dQuote{Inflation Rate}.
#' @param file A file path. If given, the table is written in Latex into this file.
#' @param replace Default is `TRUE`, which means that when the table is exported, the 
#' existing file is not erased.
#' @param title Character string giving the Latex title of the table. (Only if exported.)
#' @param label Character string giving the Latex label of the table. (Only if exported.)
#' @param raw Logical, default is `FALSE`. If `TRUE`, it returns the information without formatting.
#' @param indiv Either the variable name of individual identifiers, a one sided formula, 
#' or a vector. If the data is that of a panel, this can be used to track the number 
#' of individuals per group.
#' @param prepostnames Only if there is a 'post' variable. The names of the pre and post 
#' periods to be displayed in Latex. Default is `c("Before", "After")`.
#' @param diff.inv Logical, default to `FALSE`. Whether to inverse the difference.
#'
#' @details
#' By default, when the user tries to apply this function to nun-numeric variables, an error is raised. The exception is when the all variables are selected with the dot (like in `. ~ treat`. In this case, non-numeric variables are automatically omitted (with a message).
#'
#' NAs are removed automatically: if the data contains NAs an information message will be prompted. First all observations containing NAs relating to the treatment or post variables are removed. Then if there are still NAs for the variables, they are excluded separately for each variable, and a new message detailing the NA breakup is prompted.
#'
#' @return
#' It returns a data.frame or a Latex table with the conditional means and statistical differences between the groups.
#'
#' @examples
#'
#' # Playing around with the DiD data
#' data(base_did)
#'
#' # means of treat/control
#' did_means(y+x1+period~treat, base_did)
#'
#' # same but inverting the difference
#' did_means(y+x1+period~treat, base_did, diff.inv = TRUE)
#'
#' # now treat/control, before/after
#' did_means(y+x1+period~treat|post, base_did)
#'
#' # same but with a new line giving the number of unique "indiv" for each case
#' did_means(y+x1+period~treat|post, base_did, indiv = "id")
#'
#' # same but with the treat case "0" coming first
#' did_means(y+x1+period~treat|post, base_did, indiv = ~id, treat_first = 0)
#'
#' # Selecting all the variables with "."
#' did_means(.~treat|post, base_did, indiv = "id")
#'
#'
did_means = function(fml, base, treat_var, post_var, tex = FALSE, treat_dict,
                     dict = getFixest_dict(), file, replace = FALSE, title,
                     label, raw = FALSE, indiv, treat_first, 
                     prepostnames = c("Before", "After"), diff.inv = FALSE){
  # x is a data.frame
  # treat_vector is a list of IDs
  # treat_first: the treated-value to appear first

  # Ajouter protection quand un des groupes vaut 0!!!
  # ie un des treat-post n'a pas d'observation

  fml_in = fml

  #
  # Nber of units of observations (I do it before the main data)
  #

  IS_INDIV = FALSE
  if(!missing(indiv)){
    IS_INDIV = TRUE
    if("formula" %in% class(indiv)){

      check_arg(indiv, "os formula")
      indiv_varname = attr(terms(indiv), "term.labels")

      if(missing(base)){
        stop("To use `indiv` as a formula, you must provide the argument `base`.")
      }

      if(length(indiv_varname) > 1){
        stop("The argument `indiv` must refer to only one variable.")
      }

      if(!all(all.vars(indiv) %in% names(base))){
        pblm = setdiff(all.vars(indiv), names(base))
        stop("In argument `indiv`: the variable", enumerate_items(pblm, "s.is")," not in the data set.")
      }

      indiv_var = try(eval(str2lang(indiv_varname), base), silent = TRUE)
      if("try-error" %in% class(indiv_var)){
        stop("Evaluation of `indiv` raises and error:\n", indiv_var)
      }
    } else if(length(indiv) == 1 && is.character(indiv)){
      indiv_varname = indiv

      if(missing(base) || !indiv %in% names(base)){
        stop("To use `indiv` as a `character string` you must provide the argument `base`.")
      } else {
        indiv_var = base[[indiv]]
      }
    } else {
      if(!missing(base) && length(indiv) != NROW(base)){
        stop("The length of `indiv` must be the same as the data.")
      } else if(missing(base) && length(indiv) != NROW(fml)){
        stop("The length of `indiv` must be the same as the data.")
      }
      indiv_var = indiv
    }
  } else {
    indiv_var = NULL
  }

  #
  # Extracting the information
  #

  usePost = FALSE
  if("formula" %in% class(fml_in)){
    if(missing(base) || !is.data.frame(base)){
      stop("If you provide a formula, a data.frame must be given in argument `base`.")
    }

    # Creation of x and the condition
    if(!length(fml_in) == 3){
      stop("The formula must be of the type `var1 + ... + varN ~ treat` or `var1 + ... + varN ~ treat | post`.")
    }

    fml_parts = fml_split(fml_in)

    fml = fml_parts[[1]]
    pipe = NULL
    if(length(fml_parts) > 1){
      pipe = fml_parts[[2]]
    }

    #
    # Treat & post
    #

    if(!(all(all.vars(fml[[3]]) %in% names(base)))){
      pblm = setdiff(all.vars(fml[[3]]), names(base))
      stop("In the evaluation of the treatment variable: ", enumerate_items(pblm, "is"), " not in the data set.")
    }

    treat_var = try(eval(fml[[3]], base), silent = TRUE)
    if("try-error" %in% class(treat_var)){
      stop("Evaluation of the `treatment` variable raises and error: \n", treat_var)
    }

    if(!is.null(pipe)){
      if(!(all(all.vars(pipe) %in% names(base)))){
        pblm = setdiff(all.vars(pipe), names(base))
        stop("In the evaluation of the `post` variable: ", enumerate_items(pblm, "is"), " not in the data set.")
      }

      post_var = try(eval(pipe[[2]], base), silent = TRUE)
      if("try-error" %in% class(post_var)){
        stop("Evaluation of the `post` variable raises and error: \n", treat_var)
      }

      usePost = TRUE
    }

    #
    # The variables
    #

    # special case: the point
    if(deparse(fml[[2]])[1] == "."){
      all_vars = setdiff(names(base), deparse(fml[[3]])[1])
      if(usePost) all_vars = setdiff(all_vars, as.character(pipe))
      if(IS_INDIV) all_vars = setdiff(all_vars, indiv_varname)

      if("data.table" %in% class(base)){
        mat_vars = as.data.frame(base[, all_vars, with = FALSE])
      } else {
        mat_vars = base[, all_vars, FALSE]
      }

      # we drop non-numeric info + note
      base_small = head(mat_vars, 10)
      is_num = sapply(base_small, function(x) is.numeric(x) || is.logical(x))
      pblm = all_vars[!is_num]

      if(all(!is_num)){
        stop("Function cannot be performed: not any numeric variable.")
      } else if(length(pblm) > 0){
        message("NOTE: The variable", enumerate_items(pblm, "s.is"), " removed because they are non-numeric.")
      }

      mat_vars = as.matrix(mat_vars[, is_num, FALSE])
    } else {
      # we swap the formula to use model.frame
      fml_x = as.formula(paste0("1~", deparse_long(fml[[2]]), "-1"))

      # Variables control:
      all_vars = all.vars(fml_x)
      pblm = setdiff(all_vars, names(base))
      if(length(pblm) > 0){
        stop("The variable", enumerate_items(pblm, "s.is"), " not in the data set (", deparse_short(substitute(base)), ").")
      }

      # Evaluation control
      base_small = head(base, 10)
      var2eval = attr(terms(fml_x), "term.labels")
      var2eval = gsub(":", "*", var2eval)
      for(i in seq_along(var2eval)){
        var = var2eval[i]
        x_small = try(eval(parse(text=var), base_small), silent = TRUE)
        if("try-error" %in% class(x_small)){
          stop("Evaluation of the variable `", var, "` raises and error:\n", x_small)
        }

        if(!is.numeric(x_small) && !is.logical(x_small)){
          stop("The variable ", var, " is not numeric, please provide only numeric variables.")
        }
      }

      mat_vars = prepare_matrix(fml_x, base)
    }

    if(is.logical(mat_vars)){
      mat_vars = as.numeric(mat_vars)
    }

    # other info
    x_name = colnames(mat_vars)
  } else {
    if(missing(treat_var)){
      stop("If argument `fml` is not a formula, you must provide the argument `treat_var`.")
    } else {
      mat_vars = fml_in

      if(NROW(mat_vars) != length(treat_var)){
        stop("The arguments `x` and `treat_var` must be of the same length.")
      }

      if(!missing(post_var) && !is.null(post_var)){
        if(NROW(mat_vars) != length(post_var)){
          stop("If provided, the argument `post_var` must be of the same length of `x`.")
        }
        usePost = TRUE
      }

      # We exclude non numeric variables
      if("data.frame" %in% class(mat_vars)){
        is_num = sapply(mat_vars, function(x) is.numeric(x) || is.logical(x))
        if(any(!is_num)){
          pblm = names(mat_vars)[!is_num]
          stop("This function works only for numeric variables. Variable", enumerate_items(pblm, "s.is"), " not numeric.")
        }
        mat_vars = as.matrix(mat_vars)
      } else if(!is.matrix(mat_vars)){
        stop("If not a formula, argument `fml` must be a data.frame or a matrix. Currently it is of class ", class(mat_vars)[1], ".")
      } else if(!is.numeric(mat_vars) && !is.logical(mat_vars)){
        stop("If not a formula, argument `fml` must be a data.frame or a matrix with numeric variables. Currenlty its is not numeric.")
      }

      if(is.logical(mat_vars)) mat_vars = as.numeric(mat_vars)

      # other info
      x_name = colnames(mat_vars)
    }
  }

  #
  # NA control
  #

  # Treat & post
  ANY_NA = FALSE
  if(anyNA(treat_var) || (usePost && anyNA(post_var))){
    ANY_NA = TRUE
    qui_NA = is.na(treat_var)
    if(usePost) qui_NA = qui_NA | is.na(post_var)

    if(all(qui_NA)){
      msg = "treatment variable."
      if(usePost && anyNA(post_var)){
        if(anyNA(treat_var)){
          msg = "`treatment` and `post` variables."
        } else {
          msg = "`post` variable."
        }
      }
      stop("All observation contain NA values for the ", msg)
    }

    mat_vars = mat_vars[!qui_NA, ]
    treat_var = treat_var[!qui_NA]
    if(usePost) post_var = post_var[!qui_NA]

    if(IS_INDIV) indiv_var = indiv_var[!qui_NA]

    msg = ifelse(usePost, "/post variables.", " variable.")
    message("NOTE: ", sum(qui_NA), " observations removed because of NA values in the treatment", msg)

  }



  # Treatment / post controls
  if(length(unique(treat_var)) != 2){
    n = length(unique(treat_var))
    msg = ifelse(n == 1, "only one value.", paste0(n, " values."))
    stop("This function supports only 2 conditional values for the treatment variable. Currently, it contains ", msg)
  }

  if(usePost && length(unique(post_var)) != 2){
    n = length(unique(post_var))
    msg = ifelse(n == 1, "only one value.", paste0(n, " values."))
    stop("This function supports only 2 conditional values for the `post` variable. Currently, it contains ", msg)
  }

  if(usePost){
    check_arg(prepostnames, "character vector len(2) no na")
  }

  treat_cases = sort(unique(treat_var), decreasing = TRUE)
  if(!missing(treat_dict) && !is.null(treat_dict)){

    if(!isVector(treat_dict) || is.null(names(treat_dict))){
      stop("The argument `treat_dict` must be a named character vector.")
    }

    pblm = setdiff(treat_cases, names(treat_dict))
    if(length(pblm) > 0){
      stop("The value", enumerate_items(pblm, "s.is"), " not in `treat_dict`.")
    }
  } else {
    treat_dict = paste0("cond: ", treat_cases)
    names(treat_dict) = treat_cases
  }

  #
  # The main function
  #

  if(!missing(treat_first) && !is.null(treat_first)){
    if(!treat_first %in% treat_cases){
      stop("Argument `treat_first` must be an element of the treated variable.")
    } else if(treat_first != treat_cases[1]){
      treat_cases = rev(treat_cases)
    }
  }

  if(!usePost){
    # the simple case
    res = .meanDiffTable(mat_vars, treat_var, treat_cases, indiv_var, raw = raw, 
                         diff.inv = diff.inv)

    if(any(attr(res, "na"))){
      nb_na = attr(res, "na")
      var_with_na = x_name[nb_na > 0]
      message("NA values were omitted for the variable", enumerate_items(paste0(var_with_na, " (", nb_na[nb_na > 0], ")"), "s"), ".")
    }

    attr(res, "na") = NULL


  } else {
    # the more complex case
    post_values = sort(unique(post_var))

    qui_pre = which(post_var == post_values[1])
    res_pre = .meanDiffTable(mat_vars[qui_pre, ], treat_var[qui_pre], treat_cases, 
                             indiv_var[qui_pre], raw = raw, diff.inv = diff.inv)

    qui_post = which(post_var == post_values[2])
    res_post = .meanDiffTable(mat_vars[qui_post, ], treat_var[qui_post], treat_cases, 
                              indiv_var[qui_post], raw = raw, diff.inv = diff.inv)

    nb_na = attr(res_pre, "na") + attr(res_post, "na")

    res = cbind(res_pre, res_post[, -1])

    if(any(nb_na)){
      var_with_na = x_name[nb_na > 0]
      message("NA values were omitted for the variable", enumerate_items(paste0(var_with_na, " (", nb_na[nb_na > 0], ")"), "s"), ".")
    }

    attr(res, "na") = NULL
  }

  #
  # Tex exportation
  #

  if(tex || !missing(file)){
    # we sink!
    if(!missing(file)){
      sink(file = file, append = (replace == FALSE))
      on.exit(sink())
    }

    if(!missing(title)){
      # if there is the title, then we create a "table" environment
      myTitle = title
      if(!missing(label)) myTitle = paste0("\\label{", label, "} ", myTitle)
      cat("\\begin{table}[htbp]\\centering\n\\caption{",  myTitle, "}\n")
    }

    #
    # we write the tex code
    #

    treat_text = paste0(treat_dict[as.character(treat_cases)], collapse = " & ")

    if(!usePost){
      # The simple case
      cat("\\begin{tabular}{lcccc}\n")
      cat(" &  &  &  & \\tabularnewline\n\\hline\n\\hline\n")

      cat(" & ", treat_text,  "\\tabularnewline\n")

      # the variables "line"
      cat("Variables & mean (s.e.) & mean (s.e.) & Diff. & t-stat \\tabularnewline\n")
      cat("\\hline\n")
    } else {
      # The case with pre and post
      cat("\\begin{tabular}{lcccccccc}\n")
      cat(" &  &  &  &  &  &  &  & \\tabularnewline\n\\hline\n\\hline\n")

      # the treat post line
      cat(" & ", "\\multicolumn{4}{c}{", prepostnames[1], "} & \\multicolumn{4}{c}{", 
          prepostnames[2], "}\\tabularnewline\n")

      # the 'treated' line
      cat(" & ", treat_text, " &  &  & ", treat_text, "\\tabularnewline\n")

      # the variables "line"
      cat("Variables & mean (s.e.) & mean (s.e.) & Diff. & t-stat & mean (s.e.) & mean (s.e.) & Diff. & t-stat \\tabularnewline\n")
      cat("\\hline\n")
    }


    # Changing the names of the variables
    if(!missnull(dict)){
      qui = which(res$vars %in% names(dict))
      for(i in qui) res$vars[i] = escape_latex(dict[res$vars[i]])
    }

    # The data
    for(i in 1:nrow(res)){
      cat(paste0(res[i,], collapse = " & "), "\\tabularnewline\n")
    }

    cat("\\hline\n\\hline\n &  &  &  & \\tabularnewline\n")
    cat("\\end{tabular}\n")

    if(!missing(title)) cat("\\end{table}\n")

    return(invisible(res))
  }

  res
}

.meanDiffTable = function(mat_vars, treat_var, treat_cases, indiv_var, raw, diff.inv = FALSE){
  # This function is the workhorse of did_means

  x_name = colnames(mat_vars)

  treat_01 = 1 * (treat_var == treat_cases[2])

  nthreads = getFixest_nthreads()
  info = cpp_cond_means(mat_vars, treat_01, nthreads)

  means = info$means
  sds = info$sd
  ns = info$n
  n_01 = info$n_01

  name_1 = as.character(treat_cases[1])
  name_2 = as.character(treat_cases[2])

  means_1 = means[, 1]
  means_2 = means[, 2]

  sds_1 = sds[, 1]
  sds_2 = sds[, 2]

  n_1 = ns[, 1]
  n_2 = ns[, 2]

  format_1 = paste0(mysignif(means_1, 2), " (", mysignif(sds_1, 2), ")")
  format_2 = paste0(mysignif(means_2, 2), " (", mysignif(sds_2, 2), ")")

  if(diff.inv){
    D = (means_2 - means_1)
  } else {
    D = (means_1 - means_2)
  }

  sd_D = sqrt(sds_1**2/n_1 + sds_2**2/n_2)

  t_stat = signif(D/sd_D, 3)

  #
  # Formatting
  #

  if(raw){
    res = data.frame(vars = x_name, stringsAsFactors = FALSE)
    res[[paste0("Mean: ", treat_cases[1])]] = c(means_1, recursive = TRUE)
    res[[paste0("Mean: ", treat_cases[2])]] = c(means_2, recursive = TRUE)
    res[[paste0("se: ", treat_cases[1])]] = c(sds_1, recursive = TRUE)
    res[[paste0("se: ", treat_cases[2])]] = c(sds_2, recursive = TRUE)
    res[["Difference"]] = c(D, recursive=TRUE)
    res[["t-stat"]] = c(as.character(t_stat), recursive=TRUE)
  } else {
    res = data.frame(vars = x_name, stringsAsFactors = FALSE)
    res[[paste0("cond: ", treat_cases[1])]] = format_1
    res[[paste0("cond: ", treat_cases[2])]] = format_2
    res[["Difference"]] = c(mysignif(D, 3), recursive = TRUE)
    res[["t-stat"]] = c(as.character(t_stat), recursive = TRUE)

    res[nrow(res) + 1, ] = c("Observations", addCommas(n_01[1]), addCommas(n_01[2]), "", "")
    # Le nombre d'individus
    if(!missing(indiv_var) && !is.null(indiv_var)){
      nb_indiv =  tapply(indiv_var, treat_var, function(x) length(unique(x)))
      res[nrow(res) + 1, ] = c("# Individuals", addCommas(nb_indiv[name_1]), addCommas(nb_indiv[name_2]), "", "")
    }
  }

  attr(res, "na") = info$na

  res
}



#' Create, or interact variables with, factors
#'
#' Treat a variable as a factor, or interacts a variable with a factor. Values to 
#' be dropped/kept from the factor can be easily set. Note that to interact 
#' fixed-effects, this function should not be used: instead use directly the syntax `fe1^fe2`.
#'
#'
#' @inheritParams bin
#'
#' @param factor_var  A vector (of any type) that will be treated as a factor. 
#' You can set references (i.e. exclude values for which to create dummies) with 
#' the `ref` argument.
#' @param var A variable of the same length as `factor_var`. This variable will be 
#' interacted with the factor in `factor_var`. It can be numeric or factor-like. 
#' To force a numeric variable to be treated as a factor, you can add the `i.` 
#' prefix to a variable name. For instance take a numeric variable `x_num`: 
#' `i(x_fact, x_num)` will treat `x_num` as numeric while `i(x_fact, i.x_num)` 
#' will treat `x_num` as a factor (it's a shortcut to `as.factor(x_num)`).
#' @param ref A vector of values to be taken as references from `factor_var`. 
#' Can also be a logical: if `TRUE`, then the first value of `factor_var` will be removed.
#'  If `ref` is a character vector, partial matching is applied to values; 
#' use "@" as the first character to enable regular expression matching. See examples.
#' @param keep A vector of values to be kept from `factor_var` (all others are dropped). 
#' By default they should be values from `factor_var` and if `keep` is a 
#' character vector partial matching is applied. Use "@" as the first character 
#' to enable regular expression matching instead.
#' @param ref2 A vector of values to be dropped from `var`. By default they 
#' should be values from `var` and if `ref2` is a character vector partial matching is applied. 
#' Use "@" as the first character to enable regular expression matching instead.
#' @param keep2 A vector of values to be kept from `var` (all others are dropped). 
#' By default they should be values from `var` and if `keep2` is a character vector 
#' partial matching is applied. Use "@" as the first character 
#' to enable regular expression matching instead.
#' @param bin2 A list or vector defining the binning of the second variable.
#'  See help for the argument `bin` for details (or look at the help of the function [`bin`]). 
#' You can use `.()` for `list()`.
#' @param ... Not currently used.
#'
#' @details
#' To interact fixed-effects, this function should not be used: instead use directly the syntax
#'  `fe1^fe2` in the fixed-effects part of the formula. Please see the details and
#'  examples in the help page of [`feols`].
#'
#' @return
#' It returns a matrix with number of rows the length of `factor_var`. If there is no interacted 
#' variable or it is interacted with a numeric variable, the number of columns is equal to the 
#' number of cases contained in `factor_var` minus the reference(s). If the interacted variable is 
#' a factor, the number of columns is the number of combined cases between `factor_var` and `var`.
#'
#' @author
#' Laurent Berge
#'
#' @seealso
#' [`iplot`][fixest::coefplot] to plot interactions or factors created with `i()`, [`feols`] for 
#' OLS estimation with multiple fixed-effects.
#'
#' See the function [`bin`] for binning variables.
#'
#' @examples
#'
#' #
#' # Simple illustration
#' #
#'
#' x = rep(letters[1:4], 3)[1:10]
#' y = rep(1:4, c(1, 2, 3, 4))
#'
#' # interaction
#' data.frame(x, y, i(x, y, ref = TRUE))
#'
#' # without interaction
#' data.frame(x, i(x, "b"))
#'
#' # you can interact factors too
#' z = rep(c("e", "f", "g"), c(5, 3, 2))
#' data.frame(x, z, i(x, z))
#'
#' # to force a numeric variable to be treated as a factor: use i.
#' data.frame(x, y, i(x, i.y))
#'
#' # Binning
#' data.frame(x, i(x, bin = list(ab = c("a", "b"))))
#'
#' # Same as before but using .() for list() and a regular expression
#' # note that to trigger a regex, you need to use an @ first
#' data.frame(x, i(x, bin = .(ab = "@a|b")))
#'
#' #
#' # In fixest estimations
#' #
#'
#' data(base_did)
#' # We interact the variable 'period' with the variable 'treat'
#' est_did = feols(y ~ x1 + i(period, treat, 5) | id + period, base_did)
#'
#' # => plot only interactions with iplot
#' iplot(est_did)
#'
#' # Using i() for factors
#' est_bis = feols(y ~ x1 + i(period, keep = 3:6) + i(period, treat, 5) | id, base_did)
#'
#' # we plot the second set of variables created with i()
#' # => we need to use keep (otherwise only the first one is represented)
#' coefplot(est_bis, keep = "trea")
#'
#' # => special treatment in etable
#' etable(est_bis, dict = c("6" = "six"))
#'
#' #
#' # Interact two factors
#' #
#'
#' # We use the i. prefix to consider week as a factor
#' data(airquality)
#' aq = airquality
#' aq$week = aq$Day %/% 7 + 1
#'
#' # Interacting Month and week:
#' res_2F = feols(Ozone ~ Solar.R + i(Month, i.week), aq)
#'
#' # Same but dropping the 5th Month and 1st week
#' res_2F_bis = feols(Ozone ~ Solar.R + i(Month, i.week, ref = 5, ref2 = 1), aq)
#'
#' etable(res_2F, res_2F_bis)
#'
#' #
#' # Binning
#' #
#'
#' data(airquality)
#'
#' feols(Ozone ~ i(Month, bin = "bin::2"), airquality)
#'
#' feols(Ozone ~ i(Month, bin = list(summer = 7:9)), airquality)
#'
#'
#'
i = function(factor_var, var, ref, keep, bin, ref2, keep2, bin2, ...){
  # Used to create interactions

  # Later: binning (bin = 1:3 // bin = list("a" = "[abc]")). Default name is bin name (eg "1:3")

  # gt = function(x) cat(sfill(x, 20), ": ", -(t0 - (t0<<-proc.time()))[3], "s\n", sep = "")
  # t0 = proc.time()

  validate_dots(valid_args = c("f2", "f_name", "ref_special", "sparse"))

  dots = list(...)
  is_sparse = isTRUE(dots$sparse)

  mc = match.call()

  # Finding if it's a call from fixest
  FROM_FIXEST = is_fixest_call()

  # General checks
  check_arg(factor_var, "mbt vector")

  # NOTA:
  # the user can use the prefix "i." to tell the algorithm to consider the
  # variable var as a factor. This requires a non standard evaluation
  #

  var_name = NULL
  if(!is.null(mc$f2)){
    # should be only used internally

    var_name = deparse_long(mc$f2)
    var = dots$f2
    check_value(var, "vector", .arg_name = "f2")
  }

  # General information on the factor variable
  if(!is.null(dots$f_name)){
    f_name = dots$f_name
  } else {
    f_name = deparse_long(mc$factor_var)
  }
  f = factor_var # renaming for clarity

  # checking var
  IS_INTER_NUMERIC = IS_INTER_FACTOR = FALSE
  if(!missing(var)){
    # Evaluation of var (with possibly, user prepending with i.)
    if(is.null(var_name)){
      var_name = deparse_long(mc$var)
      if(grepl("^i\\.", var_name)){
        var_name = gsub("^i\\.", "", var_name)
        var = str2lang(var_name)
        check_set_value(var, "evalset vector", .data = parent.frame(), .arg_name = "var")
        IS_INTER_FACTOR = TRUE
      } else {
        check_value(var, "vector", .arg_name = "var")
      }
    } else {
      # f2 was used
      IS_INTER_FACTOR = TRUE
    }

    # is it an interaction with a factor or with a continuous variable?
    if(length(var) == length(f)){
      # Logical => numeric by default
      # otherwise, just setting the flags

      if(IS_INTER_FACTOR == FALSE){
        # If previous condition == TRUE, means was requested by the user

        if(is.logical(var)){
          var = as.numeric(var)
          IS_INTER_NUMERIC = TRUE
        } else if(is.numeric(var)){
          IS_INTER_NUMERIC = TRUE
        } else {
          IS_INTER_FACTOR = TRUE
        }
      }

    } else if(length(var) <= 2 && !"ref" %in% names(mc)){
      # ref is implicitly called via the location of var
      ref = var
      dots$ref_special = FALSE
    } else {

      if(grepl("^[[:alpha:]\\.][[:alnum:]\\._]*:[[:alpha:]\\.][[:alnum:]\\._]*$", var_name)){
        info = strsplit(var_name, ":")[[1]]
        stop("In i(): When `var` is equal to a product, please use I(", info[1], "*", info[2], ") instead of ", var_name, ".")
      } else {
        stop("The arguments `var` and `f` must be of the same length (currently ", length(var), " vs ", length(f), ").")
      }
    }
  }

  IS_INTER = IS_INTER_NUMERIC || IS_INTER_FACTOR

  if(isTRUE(dots$ref_special) && (!IS_INTER || IS_INTER_FACTOR)){
    ref = TRUE
  }

  #
  # QUFing + NA
  #

  if(IS_INTER){
    is_na_all = is.na(f) | is.na(var)
  } else {
    is_na_all = is.na(f)
  }

  if(!missing(bin)){
    bin = error_sender(eval_dot(bin), arg_name = "bin")

    if(!is.null(bin)){
      f = bin_factor(bin, f, f_name)
    }
  }

  if(IS_INTER_FACTOR && !MISSNULL(bin2)){
    bin2 = error_sender(eval_dot(bin2), arg_name = "bin2")

    if(!is.null(bin2)){
      var = bin_factor(bin2, var, f_name)
    }
  }

  if(IS_INTER_FACTOR){
    info = to_integer(f, var, add_items = TRUE, items.list = TRUE, sorted = TRUE, multi.join = "__%%__")
  } else {
    info = to_integer(f, add_items = TRUE, items.list = TRUE, sorted = TRUE)
  }

  fe_num = info$x
  items = info$items

  if(!IS_INTER_NUMERIC){
    # neutral var in C code
    var = 1
  }

  if(IS_INTER_FACTOR){
    items_split = strsplit(items, "__%%__", fixed = TRUE)

    f_items = sapply(items_split, `[`, 1)
    var_items = sapply(items_split, `[`, 2)
  } else {
    f_items = items
  }

  check_arg(ref, "logical scalar | vector no na")

  check_arg(ref2, keep, keep2, "vector no na")

  NO_ERROR = FALSE
  if(is_calling_fun("fixest_model_matrix_extra", full_search = TRUE, full_name = TRUE)){
    NO_ERROR = TRUE
  }

  no_rm = TRUE
  id_drop = c()
  if(!missing(ref)){
    if(isTRUE(ref)){
      # We always delete the first value
      # Que ce soit items ici est normal (et pas f_items)
      id_drop = which(items == items[1])
    } else {
      id_drop = items_to_drop(f_items, ref, "factor_var", no_error = NO_ERROR)
    }
    ref_id = id_drop
  }


  if(!missing(keep)){
    id_drop = c(id_drop, items_to_drop(f_items, keep, "factor_var", keep = TRUE, no_error = NO_ERROR))
  }

  if(IS_INTER_FACTOR){
    if(!missing(ref2)){
      id_drop = c(id_drop, items_to_drop(var_items, ref2, "var", no_error = NO_ERROR))
    }

    if(!missing(keep2)){
      id_drop = c(id_drop, items_to_drop(var_items, keep2, "var", keep = TRUE, no_error = NO_ERROR))
    }
  }

  if(length(id_drop) > 0){
    id_drop = unique(sort(id_drop))
    if(length(id_drop) == length(items)){
      if(FROM_FIXEST) {
        # we return something neutral in an estimation
        return(rep(0, length(f)))
      }

      stop("All items from the interaction have been removed.")
    }
    who_is_dropped = id_drop
    no_rm = FALSE
  } else {
    # -1 is neutral
    who_is_dropped = -1
  }

  # The column names

  if(length(id_drop) > 0){
    items_name = items[-id_drop]
    f_items = f_items[-id_drop]
    if(IS_INTER_FACTOR){
      var_items = var_items[-id_drop]
    }
  } else {
    items_name = items
  }

  if(FROM_FIXEST){
    # Pour avoir des jolis noms c'est un vrai gloubiboulga,
    # mais j'ai pas trouve plus simple...
    if(IS_INTER_FACTOR){
      col_names = paste0("__CLEAN__", f_name, "::", f_items, ":", var_name, "::", var_items)
      col_names_full = NULL

    } else if(IS_INTER_NUMERIC){
      col_names = paste0("__CLEAN__", f_name, "::", f_items, ":", var_name)
      col_names_full = paste0(f_name, "::", items, ":", var_name)

    } else {
      col_names = paste0("__CLEAN__", f_name, "::", items_name)
      col_names_full = paste0(f_name, "::", items)
    }
  } else {

    if(IS_INTER_FACTOR){
      items_name = gsub("__%%__", ":", items_name, fixed = TRUE)
    }

    col_names = items_name
  }

  if(is_sparse){
    # Internal call: we return the row ids + the values + the indexes + the names

    if(length(who_is_dropped) > 0){
      valid_row = !is_na_all & !fe_num %in% who_is_dropped
    } else {
      valid_row = !is_na_all
    }

    # we need to ensure the IDs go from 1 to # Unique
    fe_colid = to_integer(fe_num[valid_row], sorted = TRUE)

    values = if(length(var) == 1) rep(1, length(valid_row)) else var
    res = list(rowid = which(valid_row), values = values,
           colid = fe_colid, col_names = col_names)

    class(res) = "i_sparse"

    return(res)
  }

  res = cpp_factor_matrix(fe_num, is_na_all, who_is_dropped, var, col_names)
  # res => matrix with...
  #  - NAs where appropriate
  #  - appropriate number of columns
  #  - interacted if needed
  #


  # We send the information on the reference
  if(FROM_FIXEST){
    is_GLOBAL = FALSE
    for(where in 1:min(8, sys.nframe())){
      if(exists("GLOBAL_fixest_mm_info", parent.frame(where))){
        GLOBAL_fixest_mm_info = get("GLOBAL_fixest_mm_info", parent.frame(where))
        is_GLOBAL = TRUE
        break
      }
    }

    if(is_GLOBAL && !IS_INTER_FACTOR){
      # NOTA:
      # if you change stuff here => change them also in sunab()

      info = list()
      info$coef_names_full = col_names_full
      info$items = items
      if(!missing(ref)){
        info$ref_id = ref_id
        info$ref = items[ref_id]
      }
      info$is_num = is.numeric(items)
      info$f_name = f_name
      info$is_inter_num = IS_INTER_NUMERIC
      if(IS_INTER_NUMERIC){
        info$var_name = var_name
      }
      info$is_inter_fact = IS_INTER_FACTOR

      GLOBAL_fixest_mm_info[[length(GLOBAL_fixest_mm_info) + 1]] = info
      # re assignment
      assign("GLOBAL_fixest_mm_info", GLOBAL_fixest_mm_info, parent.frame(where))
    }

  }

  res
}


i_ref = function(factor_var, var, ref, bin, keep, ref2, keep2, bin2){
  # To automatically add references when i(x) is used

  mc = match.call()

  mc[[1]] = as.name("i")

  if(!any(c("ref", "keep", "ref2", "keep2") %in% names(mc))){
    mc$ref_special = TRUE
  }

  return(deparse_long(mc))
}

i_noref = function(factor_var, var, ref, bin, keep, ref2, keep2, bin2){
  # Used only in predict => to create data without restriction

  mc = match.call()

  mc[[1]] = as.name("i")

  mc$ref = mc$keep = mc$ref2 = mc$keep2 = NULL

  return(deparse_long(mc))
}





#' Expands formula macros
#'
#' Create macros within formulas and expand them with character vectors or other formulas.
#'
#' @inheritParams setFixest_fml
#'
#' @param fml A formula containing macros variables. Each macro variable must start with two dots. 
#' The macro variables can be set globally using `setFixest_fml`, or can be defined in `...`. 
#' Special macros of the form `..("regex")` can be used to fetch, through a regular expression, 
#' variables directly in a character vector (or in column names) given in the argument `data` (note 
#' that the algorithm tries to "guess" the argument data when nested in function calls \[see 
#' example\]). You can negate the regex by starting with a `"!"`. Square brackets have a special 
#' meaning: Values in them are evaluated and parsed accordingly. Example: `y~x.[1:2] + z.[i]` will 
#' lead to `y~x1+x2+z3` if `i==3`. You can trigger the auto-completion of variables by using the 
#' `'..'` suffix, like in `y ~ x..` which would include `x1` and `x2`, etc. See examples.
#' @param add Either a character scalar or a one-sided formula. The elements will be added to the 
#' right-hand-side of the formula, before any macro expansion is applied.
#' @param lhs If present then a formula will be constructed with `lhs` as the full left-hand-side. 
#' The value of `lhs` can be a one-sided formula, a call, or a character vector. Note that the 
#' macro variables wont be applied. You can use it in combination with the argument `rhs`. Note 
#' that if `fml` is not missing, its LHS will be replaced by `lhs`.
#' @param rhs If present, then a formula will be constructed with `rhs` as the full 
#' right-hand-side. The value of `rhs` can be a one-sided formula, a call, or a character vector. 
#' Note that the macro variables wont be applied. You can use it in combination with the argument 
#' `lhs`. Note that if `fml` is not missing, its RHS will be replaced by `rhs`.
#' @param data Either a character vector or a data.frame. This argument will only be used if a 
#' macro of the type `..("regex")` is used in the formula of the argument `fml`. If so, any 
#' variable name from `data` that matches the regular expression will be added to the formula.
#' @param frame The environment containing the values to be expanded with the dot square bracket 
#' operator. Default is `parent.frame()`.
#'
#' @details
#' In `xpd`, the default macro variables are taken from `getFixest_fml`. Any value in the `...` 
#' argument of `xpd` will replace these default values.
#'
#' The definitions of the macro variables will replace in verbatim the macro variables. Therefore, 
#' you can include multi-part formulas if you wish but then beware of the order of the macros 
#' variable in the formula. For example, using the `airquality` data, say you want to set as 
#' controls the variable `Temp` and `Day` fixed-effects, you can do 
#' `setFixest_fml(..ctrl = ~Temp | Day)`, but then `feols(Ozone ~ Wind + ..ctrl, airquality)` 
#' will be quite different from `feols(Ozone ~ ..ctrl + Wind, airquality)`, so beware!
#'
#'
#' @section Dot square bracket operator in formulas:
#'
#' In a formula, the dot square bracket (DSB) operator can: i) create manifold variables at once, 
#' or ii) capture values from the current environment and put them verbatim in the formula.
#'
#' Say you want to include the variables `x1` to `x3` in your formula. You can use 
#' `xpd(y ~ x.[1:3])` and you'll get `y ~ x1 + x2 + x3`.
#'
#' To summon values from the environment, simply put the variable in square brackets. For example: 
#' `for(i in 1:3) xpd(y.[i] ~ x)` will create the formulas `y1 ~ x` to `y3 ~ x` depending on the 
#' value of `i`.
#'
#' You can include a full variable from the environment in the same way: 
#' `for(y in c("a", "b")) xpd(.[y] ~ x)` will create the two formulas `a ~ x` and `b ~ x`.
#'
#' The DSB can even be used within variable names, but then the variable must be nested in 
#' character form. For example `y ~ .["x.[1:2]_sq"]` will create `y ~ x1_sq + x2_sq`. Using the 
#' character form is important to avoid a formula parsing error. Double quotes must be used. Note 
#' that the character string that is nested will be parsed with the function [`dsb`], and thus it 
#' will return a vector.
#'
#' By default, the DSB operator expands vectors into sums. You can add a comma, like in `.[, x]`, 
#' to expand with commas--the content can then be used within functions. For instance: 
#' `c(x.[, 1:2])` will create `c(x1, x2)` (and *not* `c(x1 + x2)`).
#'
#' In all `fixest` estimations, this special parsing is enabled, so you don't need to use `xpd`.
#'
#' One-sided formulas can be expanded with the DSB operator: let `x = ~sepal + petal`, then 
#' `xpd(y ~ .[x])` leads to `color ~ sepal + petal`.
#'
#' You can even use multiple square brackets within a single variable, but then the use of nesting 
#' is required. For example, the following `xpd(y ~ .[".[letters[1:2]]_.[1:2]"])` will create
#'  `y ~ a_1 + b_2`. Remember that the nested character string is parsed with [`dsb`], 
#' which explains this behavior.
#'
#' When the element to be expanded i) is equal to the empty string or, ii) is of length 0, it is 
#' replaced with a neutral element, namely `1`. For example, `x = "" ; xpd(y ~ .[x])` leads to
#'  `y ~ 1`.
#'
#' @section Regular expressions:
#'
#' You can catch several variable names at once by using regular expressions. To use regular 
#' expressions, you need to enclose it in the dot-dot or the regex function: `..("regex")` or 
#' `regex("regex")`. For example, `regex("Sepal")` will catch both the variables `Sepal.Length` and 
#' `Sepal.Width` from the `iris` data set. In a `fixest` estimation, the variables names from which 
#' the regex will be applied come from the data set. If you use `xpd`, you need to provide either a 
#' data set or a vector of names in the argument `data`.
#'
#' By default the variables are aggregated with a sum. For example in a data set with the variables 
#' x1 to x10, `regex("x(1|2)"` will yield `x1 + x2 + x10`. You can instead ask for "comma" 
#' aggregation by using a comma first, just before the regular expression: 
#' `y ~ sw(regex(,"x(1|2)"))` would lead to `y ~ sw(x1, x2, x10)`.
#'
#' Note that the dot square bracket operator (DSB, see before) is applied before the regular 
#' expression is evaluated. This means that `regex("x.[3:4]_sq")` will lead, after evaluation of 
#' the DSB, to `regex("x3_sq|x4_sq")`. It is a handy way to insert range of numbers in a regular 
#' expression.
#'
#'
#' @return
#' It returns a formula where all macros have been expanded.
#'
#' @author
#' Laurent Berge
#'
#'
#' @seealso
#' [`setFixest_fml`] to set formula macros, and [`dsb`] to modify character strings with the DSB operator.
#'
#' @examples
#'
#' # Small examples with airquality data
#' data(airquality)
#' # we set two macro variables
#' setFixest_fml(..ctrl = ~ Temp + Day,
#'               ..ctrl_long = ~ poly(Temp, 2) + poly(Day, 2))
#'
#' # Using the macro in lm with xpd:
#' lm(xpd(Ozone ~ Wind + ..ctrl), airquality)
#' lm(xpd(Ozone ~ Wind + ..ctrl_long), airquality)
#'
#' # You can use the macros without xpd() in fixest estimations
#' a = feols(Ozone ~ Wind + ..ctrl, airquality)
#' b = feols(Ozone ~ Wind + ..ctrl_long, airquality)
#' etable(a, b, keep = "Int|Win")
#'
#'
#' # Using .[]
#'
#' base = setNames(iris, c("y", "x1", "x2", "x3", "species"))
#' i = 2:3
#' z = "species"
#' lm(xpd(y ~ x.[2:3] + .[z]), base)
#'
#' # No xpd() needed in feols
#' feols(y ~ x.[2:3] + .[z], base)
#'
#' #
#' # Auto completion with '..' suffix
#' #
#'
#' # You can trigger variables autocompletion with the '..' suffix
#' # You need to provide the argument data
#' base = setNames(iris, c("y", "x1", "x2", "x3", "species"))
#' xpd(y ~ x.., data = base)
#'
#' # In fixest estimations, this is automatically taken care of
#' feols(y ~ x.., data = base)
#'
#'
#' #
#' # You can use xpd for stepwise estimations
#' #
#'
#' # Note that for stepwise estimations in fixest, you can use
#' # the stepwise functions: sw, sw0, csw, csw0
#' # -> see help in feols or in the dedicated vignette
#'
#' # we want to look at the effect of x1 on y
#' # controlling for different variables
#'
#' base = iris
#' names(base) = c("y", "x1", "x2", "x3", "species")
#'
#' # We first create a matrix with all possible combinations of variables
#' my_args = lapply(names(base)[-(1:2)], function(x) c("", x))
#' (all_combs = as.matrix(do.call("expand.grid", my_args)))
#'
#' res_all = list()
#' for(i in 1:nrow(all_combs)){
#'   res_all[[i]] = feols(xpd(y ~ x1 + ..v, ..v = all_combs[i, ]), base)
#' }
#'
#' etable(res_all)
#' coefplot(res_all, group = list(Species = "^^species"))
#'
#' #
#' # You can use macros to grep variables in your data set
#' #
#'
#' # Example 1: setting a macro variable globally
#'
#' data(longley)
#' setFixest_fml(..many_vars = grep("GNP|ployed", names(longley), value = TRUE))
#' feols(Armed.Forces ~ Population + ..many_vars, longley)
#'
#' # Example 2: using ..("regex") or regex("regex") to grep the variables "live"
#'
#' feols(Armed.Forces ~ Population + ..("GNP|ployed"), longley)
#'
#' # Example 3: same as Ex.2 but without using a fixest estimation
#'
#' # Here we need to use xpd():
#' lm(xpd(Armed.Forces ~ Population + regex("GNP|ployed"), data = longley), longley)
#'
#' # Stepwise estimation with regex: use a comma after the parenthesis
#' feols(Armed.Forces ~ Population + sw(regex(,"GNP|ployed")), longley)
#'
#' # Multiple LHS
#' etable(feols(..("GNP|ployed") ~ Population, longley))
#'
#'
#' #
#' # lhs and rhs arguments
#' #
#'
#' # to create a one sided formula from a character vector
#' vars = letters[1:5]
#' xpd(rhs = vars)
#'
#' # Alternatively, to replace the RHS
#' xpd(y ~ 1, rhs = vars)
#'
#' # To create a two sided formula
#' xpd(lhs = "y", rhs = vars)
#'
#' #
#' # argument 'add'
#' #
#'
#' xpd(~x1, add = ~ x2 + x3)
#'
#' # also works with character vectors
#' xpd(~x1, add = c("x2", "x3"))
#'
#' # only adds to the RHS
#' xpd(y ~ x, add = ~bon + jour)
#'
#' #
#' # Dot square bracket operator
#' #
#'
#' # The basic use is to add variables in the formula
#' x = c("x1", "x2")
#' xpd(y ~ .[x])
#'
#' # Alternatively, one-sided formulas can be used and their content will be inserted verbatim
#' x = ~x1 + x2
#' xpd(y ~ .[x])
#'
#' # You can create multiple variables at once
#' xpd(y ~ x.[1:5] + z.[2:3])
#'
#' # You can summon variables from the environment to complete variables names
#' var = "a"
#' xpd(y ~ x.[var])
#'
#' # ... the variables can be multiple
#' vars = LETTERS[1:3]
#' xpd(y ~ x.[vars])
#'
#' # You can have "complex" variable names but they must be nested in character form
#' xpd(y ~ .["x.[vars]_sq"])
#'
#' # DSB can be used within regular expressions
#' re = c("GNP", "Pop")
#' xpd(Unemployed ~ regex(".[re]"), data = longley)
#'
#' # => equivalent to regex("GNP|Pop")
#'
#' # Use .[,var] (NOTE THE COMMA!) to expand with commas
#' # !! can break the formula if missused
#' vars = c("wage", "unemp")
#' xpd(c(y.[,1:3]) ~ csw(.[,vars]))
#'
#'
#' # Example of use of .[] within a loop
#' res_all = list()
#' for(p in 1:3){
#'   res_all[[p]] = feols(Ozone ~ Wind + poly(Temp, .[p]), airquality)
#' }
#'
#' etable(res_all)
#'
#' # The former can be compactly estimated with:
#' res_compact = feols(Ozone ~ Wind + sw(.[, "poly(Temp, .[1:3])"]), airquality)
#'
#' etable(res_compact)
#'
#' # How does it work?
#' # 1)  .[, stuff] evaluates stuff and, if a vector, aggregates it with commas
#' #     Comma aggregation is done thanks to the comma placed after the square bracket
#' #     If .[stuff], then aggregation is with sums.
#' # 2) stuff is evaluated, and if it is a character string, it is evaluated with
#' # the function dsb which expands values in .[]
#' #
#' # Wrapping up:
#' # 2) evaluation of dsb("poly(Temp, .[1:3])") leads to the vector:
#' #    c("poly(Temp, 1)", "poly(Temp, 2)", "poly(Temp, 3)")
#' # 1) .[, c("poly(Temp, 1)", "poly(Temp, 2)", "poly(Temp, 3)")] leads to
#' #    poly(Temp, 1), poly(Temp, 2), poly(Temp, 3)
#' #
#' # Hence sw(.[, "poly(Temp, .[1:3])"]) becomes:
#' #       sw(poly(Temp, 1), poly(Temp, 2), poly(Temp, 3))
#'
#'
#' #
#' # In non-fixest functions: guessing the data allows to use regex
#' #
#'
#' # When used in non-fixest functions, the algorithm tries to "guess" the data
#' # so that ..("regex") can be directly evaluated without passing the argument 'data'
#' data(longley)
#' lm(xpd(Armed.Forces ~ Population + ..("GNP|ployed")), longley)
#'
#' # same for the auto completion with '..'
#' lm(xpd(Armed.Forces ~ Population + GN..), longley)
#'
#'
xpd = function(fml, ..., add = NULL, lhs, rhs, data = NULL, frame = parent.frame()){

  if(MISSNULL(data)){
    # We "guess" the data
    sc = sys.calls()
    n_sc = length(sc)
    if(n_sc > 1){
      mc = tryCatch(match.call(definition = sys.function(n_sc - 1), call = sys.call(n_sc - 1)), error = function(e) NULL)
      if("data" %in% names(mc)){
        data = tryCatch(eval(mc$data, parent.frame(2)), error = function(e) NULL)
      }
    }
  }

  .xpd(fml = fml, ..., add = add, lhs = lhs, rhs = rhs, data = data, 
       check = TRUE, macro = TRUE, frame = frame)
}

.xpd = function(fml, ..., add = NULL, lhs, rhs, data = NULL, check = FALSE, 
                macro = FALSE, frame = .GlobalEnv){

  is_lhs = !missing(lhs)
  is_rhs = !missing(rhs)
  if(is_lhs || is_rhs){

    if(check) check_arg(fml, .type = "formula", .up = 1)

    # Direct formula creation
    if(missing(fml)){
      res = if(is_lhs) 1 ~ 1 else ~ 1
    } else {
      if(is_lhs && length(fml)[1] == 2){
        # Means: RHS formula and we add the LHS
        res = 1 ~ 1
        res[[3]] = fml[[2]]

      } else {
        res = fml
      }
    }

    if(is_lhs){
      res[[2]] = value2stringCall(lhs, call = TRUE, check = check)
    }

    if(is_rhs){
      res[[length(res)]] = value2stringCall(rhs, call = TRUE, check = check)
    }

    fml = res

    attr(fml, ".Environment") = frame

    if(!macro && missnull(add)) return(fml)

    # NOTA:
    # if we allow for macro implementation ex post:
    # This entails a 50% performance drop in terms of speed.
    # Now, without macro variables, speed is at 30us while it was 20us before
    # so in .xpd => macro argument

  } else if(!missing(add)){

    if(check){
      check_arg(fml, .type = "NULL formula", .up = 1)
    }

    if(missnull(fml)){
      fml = ~ 1
      fml[[2]] = value2stringCall(add, call = TRUE, check = check)
      add = NULL

      attr(fml, ".Environment") = frame
    }

  } else if(check){
    check_arg(fml, .type = "formula mbt", .up = 1)
  }


  if(!missnull(add)){
    # Direct formula manipulation is too complicated (and I want to avoid ugly parentheses)
    # by string it's easy
    fml_dp = deparse_long(fml)

    add_txt = value2stringCall(add, call = FALSE, check = check)
    add_txt = gsub("^~", "", add_txt)

    fml = as.formula(paste0(fml_dp, "+", add_txt), frame)
  }

  macros = parse_macros(..., from_xpd = TRUE, check = check, frame = frame)

  is_macro = length(macros) != 0
  is_data = !missnull(data)
  fml_funs = all.vars(fml, functions = TRUE)
  is_brackets = "[" %in% fml_funs
  is_regex = any(c("regex", "..") %in% fml_funs)
  if(!(is_macro || is_data || is_brackets || is_regex)) return(fml)

  fml_dp = NULL

  if(is_macro){
    # We allow recursion + auto-blocking at 5th depth
    # to allow crazy things from the user side (but not too much)
    max_depth = 5
    depth  = 0
    any_done = TRUE
    qui = which(names(macros) %in% all.vars(fml))
    while(length(qui) > 0 && any_done && depth < max_depth){
      depth = depth + 1
      fml_dp_origin = fml_dp = deparse_long(fml)
      # We need to add a lookafter assertion: otherwise if we have ..ctrl + ..ctrl_long, there will be a replacement in ..ctrl_long
      for(i in qui){
        fml_dp = gsub(paste0(escape_regex(names(macros)[i]), "(?=$|[^[:alnum:]_\\.])"), macros[[i]], fml_dp, perl = TRUE)
      }

      fml = as.formula(fml_dp, frame)

      qui = which(names(macros) %in% all.vars(fml))
      any_done = fml_dp_origin != fml_dp
    }

    if(depth == max_depth && length(qui) > 0){
      warni("In xpd, max recursivity has been reached. ",
            "Please revise the recursive definition of your variables. ",
            "It concerns the variable{$s, q, enum ? names(macros)[qui]}.")
    }
  }

  data_vars = NULL
  if(is_regex){
    # We expand only if data is provided (it means the user wants us to check)
    # if .[]: we expand inside the ..(".[1:3]") => ..("1|2|3")

    if(is.null(fml_dp)) fml_dp = deparse_long(fml)
    is_brackets = grepl(".[", fml_dp, fixed = TRUE)

    if(is_data || is_brackets){

      if(is.null(fml_dp)) fml_dp = deparse_long(fml)

      if(is_data){
        check_arg(data, "character vector no na | matrix | data.frame")

        if(is.matrix(data)){
          data = colnames(data)
          if(is.null(data)){
            stop("The argument `data` must contain variables names. It is currently a matrix without column names.")
          }
        } else if(is.data.frame(data)){
          data = names(data)
        }

        data_vars = data
      }

      fml_dp_split = strsplit(fml_dp, '(?<![[:alnum:]._])(regex|\\.\\.)\\((?=[,"])',
                              perl = TRUE)[[1]]

      res = fml_dp_split
      for(i in 2:length(res)){

        re = sub('"\\).*', "", res[i])

        re_width = nchar(re)

        is_comma = grepl("^,", re)
        re = gsub("^,? ?\"", "", re)

        re = dot_square_bracket(re, frame, regex = TRUE)

        if(is_data){
          if(substr(re, 1, 1) == "!"){
            # negation
            re = str_trim(re, 1)
            vars = data[!grepl(re, data, perl = TRUE)]
          } else {
            vars = grep(re, data, value = TRUE, perl = TRUE)
          }

          if(length(vars) == 0){
            vars = "1"
          }

          coll = if(is_comma) ", " else " + "

          res[i] = paste0(paste(vars, collapse = coll), substr(res[i], re_width + 3, nchar(res[i])))
        } else {
          res[i] = paste0("..(\"", re, "\")", substr(res[i], re_width + 3, nchar(res[i])))
        }

      }

      fml_txt = paste(res, collapse = "")
      fml = error_sender(as.formula(fml_txt, frame),
                         "Expansion of variables in ..(\"regex\"): coercion of the following text to a formula led to an error.\n",
                         fit_screen(paste0("...TEXT: `", fml_txt, "`")),
                         "\nPROBLEM: see below", up = 1)
    }
  }

  if("[" %in% all.vars(fml, functions = TRUE)){
    fml_txt = deparse_long(fml)
    if(grepl(".[", fml_txt, fixed = TRUE)){
      fml_txt = dot_square_bracket(fml_txt, frame)

      # level 1 recursivity
      if(grepl(".[", fml_txt, fixed = TRUE)){
        fml_txt = dot_square_bracket(fml_txt, frame)
      }

      fml = error_sender(as.formula(fml_txt, frame),
                 "Dot square bracket operator: coercion of the following text to a formula led to an error.\n",
                 fit_screen(paste0("...TEXT: `", fml_txt, "`")),
                 "\nPROBLEM: see below", up = 1)
    }

  }

  if(is_data){
    var_to_complete = grep("[[:alnum:]]\\.\\.$", all.vars(fml), value = TRUE)
    n_var = length(var_to_complete)
    if(n_var > 0){

      if(is.null(data_vars)){
        check_arg(data, "character vector no na | matrix | data.frame")

        if(is.matrix(data)){
          data = colnames(data)
          if(is.null(data)){
            stop("The argument `data` must contain variables names. It is currently a matrix without column names.")
          }
        } else if(is.data.frame(data)){
          data = names(data)
        }

        data_vars = data
      }

      fml_txt = deparse_long(fml)
      for(i in 1:n_var){
        var = var_to_complete[i]
        vars_filled = data_vars[startsWith(data_vars, str_trim(var, n_last = 2))]
        if(length(vars_filled) == 0) next

        vars_new = paste0(vars_filled, collapse = " + ")

        pattern = sma("(?<![[:alnum:]._])\\Q{var}\\E(?![[:alnum:]._])")

        fml_txt = gsub(pattern, vars_new, fml_txt, perl = TRUE)
      }

      fml = error_sender(as.formula(fml_txt, frame),
                         "Expansion of variables ending with `..` did not work. Coercion of the following text to a formula led to an error.\n",
                         fit_screen(paste0("...TEXT: `", fml_txt, "`")),
                         "\nPROBLEM: see below", up = 1)
    }
  }

  fml
}


#' Centers a set of variables around a set of factors
#'
#' User-level access to internal demeaning algorithm of `fixest`.
#' 
#' @inheritParams feols
#' @inheritSection feols Varying slopes
#'
#' @param X A matrix, vector, data.frame or a list OR a formula OR a [`feols`] estimation. If equal 
#' to a formula, then the argument `data` is required, and it must be of the type: 
#' `x1 + x2 ~ f1 + fe2` with on the LHS the variables to be centered, and on the RHS the factors 
#' used for centering. Note that you can use variables with varying slopes with the syntax 
#' `fe[v1, v2]` (see details in [`feols`]). If a `feols` estimation, all variables (LHS+RHS) are 
#' demeaned and then returned (only if it was estimated with fixed-effects). Otherwise, it must 
#' represent the data to be centered. Of course the number of observations of that data must be the 
#' same as the factors used for centering (argument `f`).
#' @param f A matrix, vector, data.frame or list. The factors used to center the variables in 
#' argument `X`. Matrices will be coerced using `as.data.frame`.
#' @param slope.vars A vector, matrix or list representing the variables with varying slopes. 
#' Matrices will be coerced using `as.data.frame`. Note that if this argument is used it MUST be in 
#' conjunction with the argument `slope.flag` that maps the factors to which the varying slopes are 
#' attached. See examples.
#' @param slope.flag An integer vector of the same length as the number of variables in `f` (the 
#' factors used for centering). It indicates for each factor the number of variables with varying 
#' slopes to which it is associated. Positive values mean that the raw factor should also be 
#' included in the centering, negative values that it should be excluded. Sorry it's complicated... 
#' but see the examples it may get clearer.
#' @param data A data.frame containing all variables in the argument `X`. Only used if `X` is a 
#' formula, in which case `data` is mandatory.
#' @param weights Vector, can be missing or NULL. If present, it must contain the same number of 
#' observations as in `X`.
#' @param nthreads Number of threads to be used. By default it is equal to `getFixest_nthreads()`.
#' @param notes Logical, whether to display a message when NA values are removed. By default it is 
#' equal to `getFixest_notes()`.
#' @param iter Number of iterations, default is 2000.
#' @param tol Stopping criterion of the algorithm. Default is `1e-6`. The algorithm stops when the 
#' maximum absolute increase in the coefficients values is lower than `tol`.
#' @param na.rm Logical, default is `TRUE`. If `TRUE` and the input data contains any NA value, 
#' then any observation with NA will be discarded leading to an output with less observations than 
#' the input. If `FALSE`, if NAs are present the output will also be filled with NAs for each NA 
#' observation in input.
#' @param as.matrix Logical, if `TRUE` a matrix is returned, if `FALSE` it will be a data.frame. 
#' The default depends on the input, if atomic then a matrix will be returned.
#' @param im_confident Logical, default is `FALSE`. FOR EXPERT USERS ONLY! This argument allows to 
#' skip some of the preprocessing of the arguments given in input. If `TRUE`, then `X` MUST be a 
#' numeric vector/matrix/list (not a formula!), `f` MUST be a list, `slope.vars` MUST be a list, 
#' `slope.vars` MUST be consistent with `slope.flag`, and `weights`, if given, MUST be numeric (not 
#' integer!). Further there MUST be not any NA value, and the number of observations of each 
#' element MUST be consistent. Non compliance to these rules may simply lead your R session to 
#' break.
#' @param fixef.reorder Logical, default is `TRUE`. Whether to reorder the fixed-effects by 
#' frequencies before feeding them into the algorithm. If `FALSE`, the original fixed-effects order 
#' provided by the user is maintained. In general, reordering leads to faster and more precise 
#' performance.
#' @param ... Not currently used.
#'
#' @return
#' It returns a data.frame of the same number of columns as the number of variables to be centered.
#'
#' If `na.rm = TRUE`, then the number of rows is equal to the number of rows in input minus the 
#' number of NA values (contained in `X`, `f`, `slope.vars` or `weights`). The default is to have 
#' an output of the same number of observations as the input (filled with NAs where appropriate).
#'
#' A matrix can be returned if `as.matrix = TRUE`.
#'
#' @examples
#'
#' # Illustration of the FWL theorem
#' data(trade)
#'
#' base = trade
#' base$ln_dist = log(base$dist_km)
#' base$ln_euros = log(base$Euros)
#'
#' # We center the two variables ln_dist and ln_euros
#' #  on the factors Origin and Destination
#' X_demean = demean(X = base[, c("ln_dist", "ln_euros")],
#'                   f = base[, c("Origin", "Destination")])
#' base[, c("ln_dist_dm", "ln_euros_dm")] = X_demean
#'
#' est = feols(ln_euros_dm ~ ln_dist_dm, base)
#' est_fe = feols(ln_euros ~ ln_dist | Origin + Destination, base)
#'
#' # The results are the same as if we used the two factors
#' # as fixed-effects
#' etable(est, est_fe, se = "st")
#'
#' #
#' # Variables with varying slopes
#' #
#'
#' # You can center on factors but also on variables with varying slopes
#'
#' # Let's have an illustration
#' base = iris
#' names(base) = c("y", "x1", "x2", "x3", "species")
#'
#' #
#' # We center y and x1 on species and x2 * species
#'
#' # using a formula
#' base_dm = demean(y + x1 ~ species[x2], data = base)
#'
#' # using vectors
#' base_dm_bis = demean(X = base[, c("y", "x1")], f = base$species,
#'                      slope.vars = base$x2, slope.flag = 1)
#'
#' # Let's look at the equivalences
#' res_vs_1 = feols(y ~ x1 + species + x2:species, base)
#' res_vs_2 = feols(y ~ x1, base_dm)
#' res_vs_3 = feols(y ~ x1, base_dm_bis)
#'
#' # only the small sample adj. differ in the SEs
#' etable(res_vs_1, res_vs_2, res_vs_3, keep = "x1")
#'
#' #
#' # center on x2 * species and on another FE
#'
#' base$fe = rep(1:5, 10)
#'
#' # using a formula => double square brackets!
#' base_dm = demean(y + x1 ~ fe + species[[x2]], data = base)
#'
#' # using vectors => note slope.flag!
#' base_dm_bis = demean(X = base[, c("y", "x1")], f = base[, c("fe", "species")],
#'                      slope.vars = base$x2, slope.flag = c(0, -1))
#'
#' # Explanations slope.flag = c(0, -1):
#' # - the first 0: the first factor (fe) is associated to no variable
#' # - the "-1":
#' #    * |-1| = 1: the second factor (species) is associated to ONE variable
#' #    *   -1 < 0: the second factor should not be included as such
#'
#' # Let's look at the equivalences
#' res_vs_1 = feols(y ~ x1 + i(fe) + x2:species, base)
#' res_vs_2 = feols(y ~ x1, base_dm)
#' res_vs_3 = feols(y ~ x1, base_dm_bis)
#'
#' # only the small sample adj. differ in the SEs
#' etable(res_vs_1, res_vs_2, res_vs_3, keep = "x1")
#'
#'
#'
#'
demean = function(X, f, slope.vars, slope.flag, data, weights,
                  nthreads = getFixest_nthreads(), notes = getFixest_notes(),
                  iter = 2000, tol = 1e-6, 
                  fixef.reorder = TRUE, fixef.algo = NULL,
                  na.rm = TRUE, as.matrix = is.atomic(X),
                  im_confident = FALSE, ...) {


  ANY_NA = FALSE
  # SK: to reassign class if X is data.frame, data.table or tibble. Optimally you would preserve all attributes,
  # but using attributes<- is slow on data.frames. What I did in collapse is export the SET_ATTRIB and DUPLICATE_ATTRIB
  # functions from the C-API to use them internally in R -> copy attributes without checks at 0 cost, even for large data.frames.

  # LB: next line is needed if input data is matrix and as.matrix is set to FALSE
  clx = NULL
  is_fixest = inherits(X, "fixest")
  if(lX <- is.list(X) && !is_fixest) {
    clx <- oldClass(X)
    # SK: oldClass is faster and returns NULL when the list is plain. class returns the implicit class "list".
    # This is the key to fast R code -> all data.frame methods are super slow and should duly be avoided in internal code.
    oldClass(X) <- NULL
  }
  # LB: The reassignment of attributes to data.frames is actually a very good idea, thanks Seb!

  # LB: To avoid delayed evaluation problems (due to new default is.atomic(X))
  as_matrix = as.matrix

  # internal argument
  fe_info = FALSE

  # Step 1: formatting the input
  if(!im_confident){

    check_arg(X, "numeric vmatrix | list | formula | class(fixest) mbt")
    check_arg(iter, "integer scalar GE{1}")
    check_arg(tol, "numeric scalar GT{0}")
    check_arg(notes, "logical scalar")
    check_arg("NULL class(demeaning_algo)", fixef.algo)

    validate_dots(valid_args = "fe_info", stop = TRUE)
    dots = list(...)
    fe_info = isTRUE(dots$fe_info)

    data_mbt = TRUE
    if(inherits(X, "fixest")){
      data_mbt = FALSE
      if(!identical(X$method, "feols")){
        stop("This function only works for 'feols' estimations (not for ", X$method, ").")
      }

      if(!fe_info && !is.null(X$y_demeaned)){
        return(cbind(X$y_demeaned, X$X_demeaned))
      }

      if(!"fixef_vars" %in% names(X)){
        stop("To apply demeaning, the estimation must contain fixed-effects, this is currently not the case.")
      }
      
      # controls for the demeaning algorithm
      if(!is.null(X$fixef.algo)){
        if(missnull(fixef.algo)){
          fixef.algo = X$fixef.algo
        }
      }

      # Otherwise => we create data and X: formula
      data = fetch_data(X)
      fml_linear = X$fml_all$linear
      fml_fixef = X$fml_all$fixef

      fml_vars = .xpd(~ ..lhs + ..rhs,
                      ..lhs = fml_linear[[2]],
                      ..rhs = fml_linear[[3]])

      if(!is.null(X$fml_all$iv)){
        fml_iv = X$fml_all$iv
        fml_vars = .xpd(~ ..vars + ..iv_lhs + ..iv_rhs,
                        ..vars = fml_vars,
                        ..iv_lhs = fml_iv[[2]],
                        ..iv_rhs = fml_iv[[3]])
      }

      fml = .xpd(lhs = fml_vars, rhs = fml_fixef)

      mc = match.call()
      if(!"tol" %in% names(mc)) tol = X$fixef.tol
      if(!"iter" %in% names(mc)) iter = X$fixef.iter

      weights = X[["weights"]]

      X = fml

      as_matrix = TRUE
    }
    
    if(is.null(fixef.algo)){
      fixef.algo = demeaning_algo(internal = TRUE)
    }

    #
    # X
    #

    fe_done = FALSE
    if(is.call(X)) {

      if(data_mbt) check_arg(data, "data.frame mbt")
      check_arg(X, "ts formula var(data)", .data = data)

      # Extracting the information
      terms_fixef = fixef_terms(.xpd(rhs = X[[3L]]))
      # We add the intercept only for only slope models, otherwise this would be void since the result would be always 0
      X = fixest_model_matrix(.xpd(lhs = quote(y), rhs = X[[2L]]), data, 
                              fake_intercept = any(terms_fixef$slope_flag >= 0))
      var_names = dimnames(X)[[2]]

      lX = FALSE # Needed for the rest of the code to work

      # FE
      fe_done = TRUE
      f = unclass(error_sender(prepare_df(terms_fixef$fe_vars, data),
                   "Error when evaluating the fixed-effects variables: "))

      isSlope = any(terms_fixef$slope_flag != 0)

      if(isSlope) {

        slope.vars = unclass(error_sender(prepare_df(terms_fixef$slope_vars, data),
                          "Error when evaluating the variable with varying slopes: "))

        if(anyDuplicated(terms_fixef$slope_vars)){
          # we need it!!!!
          slope.vars = slope.vars[terms_fixef$slope_vars]
        }

        slope.flag = terms_fixef$slope_flag

        is_numeric = vapply(`attributes<-`(slope.vars, NULL), is.numeric, TRUE)
        if(!all(is_numeric)){
          stop("In the fixed-effects part of the formula, variables with varying slopes must be numeric. Currently variable", enumerate_items(names(slope.vars)[!is_numeric], "s.is.quote"), " not.")
        }
      } else {
        slope.vars = list(0)
        slope.flag = rep(0L, length(f))
      }

    } else if(lX) {
      var_names = names(X)
      if(!any(clx == "data.frame")) {
        n_all = lengths(X)
        if(!all(n_all == n_all[1L])) {
          n_unik = unique(n_all)
          stop("In argument 'X' all elements of the list must be of the same length. This is currently not the case (ex: one is of length ", n_unik[1], " while another is of length ", n_unik[2], ").")
        }
      }

      # SK: If your using Rcpp and the input is NumericVector or NumericMatrix, you should get a C++ error for wrongly typed data, so I don't see the need for this
      # LB: I disagree, such error messages are poor and hard to debug for end users.

      is_numeric = vapply(`attributes<-`(X, NULL), is.numeric, TRUE)
      if(!all(is_numeric)){
        # Faster than any(!is_numeric) => LB: yet a few nano seconds saved... ;-)
        n_non_num = sum(!is_numeric)
        stop("All values in 'X' must be numeric. This is not the case for ", n_non_num, " variable", plural(n_non_num), ".")
      }

    } else {
      # Here: numeric matrix or vector
      var_names = dimnames(X)[[2L]]
    }

    # nobs: useful for checks
    nobs = if(is.list(X)) length(.subset2(X, 1L)) else NROW(X)

    #
    # f + slope.vars
    #

    if(!fe_done){
      check_arg(f, "vmatrix | list mbt")
      check_arg(slope.vars, "numeric vmatrix | list")
      check_arg(slope.flag, "integer vector no na")

      if(is.list(f)) {
        if(!is.data.frame(f)) {
          n_all = lengths(f)
          if(!all(n_all == n_all[1L])) {
            n_unik = unique(n_all)
            stop("In argument 'f' all elements of the list must be of the same length. This is currently not the case (ex: one is of length ", n_unik[1], " while another is of length ", n_unik[2], ").")
          }
          # f = as.data.frame(f)
          # SK: as.data.frame is super slow, especially on large data. You already checked the lengths, so it's ok
          # LB: true
        } else {
          oldClass(f) <- NULL
        }

      } else {
        # SK: as.data.frame on matrix is also very slow, you could do in C++ as in at collapse::mctl.. but I suppose most will pass lists of factors anyway..
        # LB: In the long run I'll deal with that but that's really low priority
        f = if(!is.array(f)) list(f) else unclass(as.data.frame(f))
      }

      is_pblm = vapply(`attributes<-`(f, NULL), function(x) !(is.numeric(x) || is.character(x)), TRUE)
      if(any(is_pblm)) {
        for(i in which(is_pblm)) f[[i]] = as.character(f[[i]])
      }

      if(length(f[[1L]]) != nobs){
        stop("The number of observations in 'X' and in 'f' don't match (", if(lX) length(X[[1L]]) else NROW(X), " vs ", length(f[[1L]]), ").")
      }

      # Now the slopes
      isSlope = FALSE
      if(!missnull(slope.vars) || !missnull(slope.flag)) {

        isSlope = TRUE

        if(missnull(slope.vars)) stop("If argument 'slope.flag' is provided, you must also provide the argument 'slope.vars'.")
        if(missnull(slope.flag)) stop("If argument 'slope.vars' is provided, you must also provide the argument 'slope.flag'.")

        if(is.list(slope.vars)){
          if(!is.data.frame(slope.vars)){
            n_all = lengths(slope.vars)
            if(!all(n_all == n_all[1L])) {
              n_unik = unique(n_all)
              stop("In argument 'slope.vars' all elements of the list must be of the same length. This is currently not the case (ex: one is of length ", n_unik[1], " while another is of length ", n_unik[2], ").")
            }

          } else {
            oldClass(slope.vars) <- NULL
          }

          is_numeric = vapply(`attributes<-`(slope.vars, NULL), is.numeric, TRUE)
          # SK: Much faster than !sapply(slope.vars, is.numeric)
          # LB: 10us diff, only impedes readability IMO
          if(!all(is_numeric)) stop("In the argument 'slope.vars', the variables with varying slopes must be numeric. Currently variable", enumerate_items(names(slope.vars)[!is_numeric], "s.is.quote"), " not.")

        } else {
          # SK: Same as above.. Necessary to convert ??
          # LB: The new C++ code accepts lists of vector/matrices or vector/matrices directly // So if we're here (meaning vector or matrix), that's fine
        }

        if(length(slope.flag) != length(f)) stop("The argument 'slope.flag' must be a vector representing, for each fixed-effect, the number of variables with varying slopes associated to it. Problem: the lengths of slope.flag and the fixed-effect differ: ", length(slope.flag), " vs ", length(f), ".")


        # SK: if(sum(abs(slope.flag)) != length(slope.vars))  : This means that slope flag can only be 0, 1 or -1. In the documentation you only talk about positive and negative values. The change I made reflects this.
        # LB: Cmon Sebastian why change sum(abs(slope.flag)) != length(slope.vars) into sum(slope.flag != 0L) != length(slope.vars)?  The time gains lie in a handful of nano second and you've just introduced a bug!

        n_sv = if(is.list(slope.vars)) length(slope.vars) else NCOL(slope.vars)
        if(sum(abs(slope.flag)) != n_sv) stop("The argument 'slope.flag' must be a vector representing, for each fixed-effect, the number of variables with varying slopes associated to it. Problem: currently the number of variables in 'slope.flag' differ from the number of variables in 'slope.vars': ", sum(abs(slope.flag)), " vs ", n_sv, ".")

      } else {
        slope.vars = list(0)
        slope.flag = rep(0L, length(f))
      }
    }


    ## weights
    check_set_arg(weights, "NULL numeric conv vector len(value) GE{0}", .value = nobs)
    is_weight = !missing(weights) && !is.null(weights)

    ## nthreads (Also seems unnecessarily costly: 250 microseconds)
    # LB: I know... but it has to be checked
    if(!missing(nthreads)) nthreads = check_set_nthreads(nthreads)

    #
    # FORMATTING
    #

    # NAs
    is_NA = FALSE
    info_X = cpp_which_na_inf(X, nthreads)

    if(info_X$any_na_inf){
      is_NA = info_X$is_na_inf
      n_na_x = 1L
    } else n_na_x = 0L


    if(anyNA(f)){
      is_na_fe = !complete.cases(f)
      n_na_fe = 1L
      is_NA = is_NA | is_na_fe
    } else n_na_fe = 0L


    is_na_slope = 0L
    if(isSlope){
      info_slopes = cpp_which_na_inf(slope.vars, nthreads)

      if(info_slopes$any_na_inf){
        is_na_slope = info_slopes$is_na_inf
        is_NA = is_NA | is_na_slope
      }
    }


    is_na_weights = 0L
    if(is_weight){
      info_weights = cpp_which_na_inf(weights, nthreads)

      if(info_weights$any_na_inf){
        is_na_weights = info_weights$is_na_inf
        is_NA = is_NA | is_na_weights
      }
    }

    n_na = sum(is_NA)
    if(n_na && notes) {
      # Here is all the error message stuff now: Only evaluated if needed
      if(n_na_x) n_na_x = sum(info_X$is_na_inf)
      if(n_na_fe) n_na_fe = sum(is_na_fe)
      slopes_msg = if(isSlope) paste0(", slope vars: ", sum(is_na_slope)) else ""
      weight_msg = if(is_weight) paste0(", weights: ", sum(is_na_weights)) else ""

      if(n_na == length(is_NA)) stop("All observations contain NA values (Breakup: X: ", n_na_x, ", f: ", n_na_fe, slopes_msg, weight_msg, ").")
      message("NOTE: ", fsignif(n_na), " observation", plural(n_na), " removed because of NA values (Breakup: X: ", n_na_x, ", f: ", n_na_fe, slopes_msg, weight_msg, ").")
    }

    if(n_na) {
      ANY_NA = TRUE
      # SK: subsetting integer is faster than logical !!
      # LB: Indeed, just checked, quite a diff. Good to know!
      cc <- which(!is_NA)
      X = select_obs(X, cc)

      f = lapply(f, `[`, cc)
      if(isSlope) slope.vars = lapply(slope.vars, `[`, cc)
      if(is_weight) weights = weights[cc]
    }

    if(!is_weight) weights = 1

  } else {
    # Need this here, otherwise error:
    var_names = if(lX) names(X) else dimnames(X)[[2L]]
    if(missing(weights) || is.null(weights)) weights = 1
    if(missnull(slope.vars) || missnull(slope.flag)){
      slope.vars = list(0)
      slope.flag = rep(0L, length(unclass(f)))
      # SK: unclass gives extra speed if f is data.frame, and no cost if f is list.
    }
    
    if(is.null(fixef.algo)){
      fixef.algo = demeaning_algo(internal = TRUE)
    }
  }

  #
  # Unclassing fes
  #

  quf_info_all = cpp_quf_table_sum(x = f, y = 0, do_sum_y = FALSE, rm_0 = FALSE,
                                      rm_1 = FALSE, rm_single = FALSE, do_refactor = FALSE,
                                      r_x_sizes = 0, obs2keep = 0, only_slope = slope.flag < 0L,
                                      nthreads = nthreads)

  # table/sum_y/sizes
  fixef_table = quf_info_all$table
  fixef_sizes = lengths(fixef_table)

  if(fixef.reorder){
    new_order = order(fixef_sizes, decreasing = TRUE)
    if(is.unsorted(new_order)){
      fixef_table = fixef_table[new_order]
      fixef_sizes = fixef_sizes[new_order]
      quf_info_all$quf = quf_info_all$quf[new_order]

      # We reorder slope.vars only if needed (because it is a pain)
      if(sum(slope.flag != 0) >= 2){

        # here slope.vars have 2 or more variables:
        # either matrix or data.frame/list
        if(!is.list(slope.vars)){
          slope.vars = as.data.frame(slope.vars)
        }
        # we ensure it's a plain list
        slope.vars = unclass(slope.vars)

        n_fixef = length(slope.flag)
        slope_vars_list = vector("list", n_fixef)
        id_current = 0
        for(i in 1:n_fixef){
          n_slopes = abs(slope.flag[i])

          if(n_slopes == 0){
            slope_vars_list[[i]] = 0
          } else {
            slope_vars_list[[i]] = slope.vars[1:n_slopes + id_current]
            id_current = id_current + n_slopes
          }
        }

        # Reordering => currently it's quite clumsy (but I HATE VSs!)
        slope.vars = vector("list", sum(abs(slope.flag)))
        slope_vars_list_reordered = slope_vars_list[new_order]
        id_current = 0
        for(i in 1:n_fixef){
          vs = slope_vars_list_reordered[[i]]
          if(is.list(vs)){
            n_vs = length(vs)
            for(j in 1:n_vs){
              slope.vars[[id_current + j]] = vs[[j]]
            }
            id_current = id_current + n_vs
          }
        }
      }
      # slope.flag must be reordered afterwards
      slope.flag = slope.flag[new_order]
    }
  }

  fixef_table_vector = unlist(fixef_table)
  if(!is.integer(slope.flag)) slope.flag = as.integer(slope.flag)

  #
  # The algorithm
  #

  # y => list, X => matrix
  if(as_matrix){
    y = 0
  } else {
    # Quirk => y returns a list only if NCOL(y) > 1 or is.list(y)
    y = if(lX) X else list(X)
    # SK: This does the same, a bit more frugal
    # LB: I tend to prefer late evaluations instead of lX which requires bookkeeping
    X = 0
  }

  vars_demean = cpp_demean(y, X, weights, iterMax = iter,
                           diffMax = tol, r_nb_id_Q = fixef_sizes,
                           fe_id_list = quf_info_all$quf, table_id_I = fixef_table_vector,
                           slope_flag_Q = slope.flag, slope_vars_list = slope.vars,
                           r_init = 0, nthreads = nthreads, 
                           algo_extraProj = fixef.algo$extraProj, 
                           algo_iter_warmup = fixef.algo$iter_warmup, 
                           algo_iter_projAfterAcc = fixef.algo$iter_projAfterAcc, 
                           algo_iter_grandAcc = fixef.algo$iter_grandAcc)

  # Internal call
  if(fe_info){

    res = list(y = vars_demean$y_demean, X = vars_demean$X_demean,
               weights = weights, fixef_id_list = quf_info_all$quf,
               slope_vars = slope.vars, slope_flag = slope.flag, varnames = colnames(X))

    return(res)

  }


  if(as_matrix) {
    if(ANY_NA && na.rm == FALSE) {
      K = NCOL(vars_demean$X_demean)
      res = matrix(NA_real_, length(is_NA), K)
      res[cc, ] = vars_demean$X_demean

    } else {
      res = vars_demean$X_demean
    }

    dimnames(res)[[2L]] = var_names

  } else {
    if(ANY_NA && na.rm == FALSE) {
      n = length(is_NA)
      # SK: One-line efficient solution to the task:
      res = lapply(vars_demean$y_demean, function(x) `[<-`(rep(NA_real_, n), cc, value = x))
      # LB: nice compactness

    } else {
      res = vars_demean$y_demean
    }

    names(res) = var_names
    # SK: This makes your function class-agnostic to lists, data.frame's, data.table's and tibbles (with the exception for any custom data.frame row.names which are not preserved, but the as.data.frame solution also did not do that)
    if(!lX || length(clx)) attr(res, "row.names") <- .set_row_names(length(res[[1L]]))
    # SK: Here !lX || length(clx) means add row.names if either X was a matrix or is classed that is a data.frame
    oldClass(res) <- if(lX) clx else "data.frame"
    # SK: If X is a plain list, and since oldClass returns NULL, this will assign a null class again.
  }
  res
}

#' Extracts the observations used for the estimation
#'
#' This function extracts the observations used in `fixest` estimation.
#'
#' @param x A `fixest` object.
#'
#' @return
#' It returns a simple vector of integers.
#'
#' @examples
#'
#' base = iris
#' names(base) = c("y", "x1", "x2", "x3", "species")
#' base$y[1:5] = NA
#'
#' # Split sample estimations
#' est_split = feols(y ~ x1, base, split = ~species)
#' (obs_setosa = obs(est_split[[1]]))
#' (obs_versi = obs(est_split[sample = "versi", drop = TRUE]))
#'
#' est_versi = feols(y ~ x1, base, subset = obs_versi)
#'
#' etable(est_split, est_versi)
#'
#'
#'
#'
obs = function(x){
  check_arg(x, "class(fixest)")

  if(isTRUE(x$lean)){
    stop("obs() does not work with models estimated with 'lean = TRUE'.")
  }

  id = 1:x$nobs_origin

  for(i in seq_along(x$obs_selection)){
    id = id[x$obs_selection[[i]]]
  }

  return(id)
}




#' Check the fixed-effects convergence of a `feols` estimation
#'
#' Checks the convergence of a `feols` estimation by computing the first-order conditions of all fixed-effects (all should be close to 0)
#'
#' @param x A [`feols`] estimation that should contain fixed-effects.
#' @param object An object returned by `check_conv_feols`.
#' @param type Either "short" (default) or "detail". If "short", only the maximum absolute FOC are 
#' displayed, otherwise the 2 smallest and the 2 largest FOC are reported for each fixed-effect and 
#' each variable.
#' @param ... Not currently used.
#'
#' Note that this function first re-demeans the variables, thus possibly incurring some extra 
#' computation time.
#'
#' @return
#' It returns a list of `N` elements, `N` being the number of variables in the estimation 
#' (dependent variable + explanatory variables +, if IV, endogenous variables and instruments). For 
#' each variable, all the first-order conditions for each fixed-effect are returned.
#'
#' @examples
#'
#' base = setNames(iris, c("y", "x1", "x2", "x3", "species"))
#' base$FE = rep(1:30, 5)
#'
#' # one estimation with fixed-effects + varying slopes
#' est = feols(y ~ x1 | species[x2] + FE[x3], base)
#'
#' # Checking the convergence
#' conv = check_conv_feols(est)
#'
#' # We can check that al values are close to 0
#' summary(conv)
#'
#' summary(conv, "detail")
#'
#'
#'
check_conv_feols = function(x){

  check_arg(x, "class(fixest) mbt")
  if(!identical(x$method, "feols")){
    stop("This function only works for 'feols' estimations (not for ", x$method, ").")
  }

  if(!"fixef_vars" %in% names(x)){
    message("This function only works with fixed-effects (which are currently not present).")
    return(NULL)
  }

  # fixef names for information
  new_order = x$fe.reorder
  fixef_vars = x$fixef_vars[new_order]
  if(!is.null(x$fixef_terms)){

    slope_variables = x$slope_variables_reordered
    slope_flag = x$slope_flag_reordered

    # We reconstruct the terms
    fixef_names = c()
    start = c(0, cumsum(abs(slope_flag)))
    for(i in seq_along(slope_flag)){
      sf = slope_flag[i]
      if(sf >= 0){
        fixef_names = c(fixef_names, fixef_vars[i])
      }

      if(abs(sf) > 0){
        fixef_names = c(fixef_names, paste0(fixef_vars[i], "[[", names(slope_variables)[start[i] + 1:abs(sf)], "]]"))
      }
    }
  } else {
    fixef_names = fixef_vars
  }


  info = demean(x, fe_info = TRUE)

  res = check_conv(y = info$y, X = info$X, fixef_id_list = info$fixef_id_list,
           slope_flag = info$slope_flag, slope_vars = info$slope_vars,
           weights = info$weights, full = TRUE, fixef_names = fixef_names)

  names(res) = info$varnames

  class(res) = "fixest_check_conv"

  res
}

#' @rdname check_conv_feols
summary.fixest_check_conv = function(object, type = "short", ...){

  check_set_arg(type, "match(short, detail)")
  if(is_user_level_call()){
    validate_dots(suggest_args = "type")
  }

  if(type == "short"){
    info_max_abs = lapply(object, function(x) sapply(x, function(y) max(abs(y))))

    info_max_abs = do.call("rbind", info_max_abs)

    cat("Maximum absolute value of the first-order conditions:\n\n")
    print(info_max_abs)

  } else {

    extract_and_format = function(x){
      # x: list

      x_sorted = lapply(x, sort)

      is_short = sapply(x, function(y) length(y) <= 4)

      x_small = list()
      for(i in seq_along(x)){
        xi = x_sorted[[i]]
        if(is_short[i]){
          x_small[[i]] = c(xi, rep(NA, 4 - length(xi)))
        } else {
          x_small[[i]] = c(head(xi, 2), tail(xi, 2))
        }
      }

      x_mat = format(do.call("rbind", x_small), digits = 3)

      x_fmt = c()
      for(i in seq_along(x)){
        xi = x_mat[i, ]
        if(is_short[i]){
          x_fmt[[i]] = gsub(", +NA", "", paste0(xi, collapse = ", "))
        } else {
          x_fmt[[i]] = paste0(xi[1], ", ", xi[2], ", ..., ", xi[3], ", ", xi[4])
        }
      }

      x_names = sfill(names(x))

      res = paste0(x_names, ": ", x_fmt)

      res
    }

    info = lapply(object, extract_and_format)

    cat("Smallest and largest values of the first-order conditions:\n\n")
    var_names = sfill(names(object), right = TRUE)
    intro = sfill("|", n = nchar(var_names[1]) + 1)
    n_var = length(object)
    for(i in 1:n_var){
      cat(var_names[i], "\n")
      value = paste0(intro, info[[i]])
      cat(value, sep = "\n")
      cat(gsub(" ", "_", intro), "\n")

      if(i < n_var) cat("\n")
    }
  }
}






#' Replicates `fixest` objects
#'
#' Simple function that replicates `fixest` objects while (optionally) computing different 
#' standard-errors. Useful mostly in combination with [`etable`] or [`coefplot`].
#'
#' @param x Either a `fixest` object, either a list of `fixest` objects created with `.l()`.
#' @param times Integer vector giving the number of repetitions of the vector of elements. By 
#' default `times = 1`. It must be either of length 1, either of the same length as the argument 
#' `x`.
#' @param each Integer scalar indicating the repetition of each element. Default is 1.
#' @param vcov A list containing the types of standard-error to be computed, default is missing. If 
#' not missing, it must be of the same length as `times`, `each`, or the final vector. Note that if 
#' the arguments `times` and `each` are missing, then `times` becomes equal to the length of 
#' `vcov`. To see how to summon a VCOV, see the dedicated section in the 
#' [vignette](https://lrberge.github.io/fixest/articles/fixest_walkthrough.html#the-vcov-argument-1).
#' @param ... In `.l()`: `fixest` objects. In `rep()`: not currently used.
#'
#' @details
#' To apply `rep.fixest` on a list of `fixest` objects, it is absolutely necessary to use 
#' `.l()` and not `list()`.
#'
#' @return
#' Returns a list of the appropriate length. Each element of the list is a `fixest` object.
#'
#' @examples
#'
#' # Let's show results with different standard-errors
#'
#' est = feols(Ozone ~ Solar.R + Wind + Temp, data = airquality)
#'
#' my_vcov = list(~ Month, ~ Day, ~ Day + Month)
#'
#' etable(rep(est, vcov = my_vcov))
#'
#' coefplot(rep(est, vcov = my_vcov), drop = "Int")
#'
#' #
#' # To rep multiple objects, you need to use .l()
#' #
#'
#' est_bis = feols(Ozone ~ Solar.R + Wind + Temp | Month, airquality)
#'
#' etable(rep(.l(est, est_bis), vcov = my_vcov))
#'
#' # using each
#' etable(rep(.l(est, est_bis), each = 3, vcov = my_vcov))
#'
#'
rep.fixest = function(x, times = 1, each = 1, vcov, ...){
  # each is applied first, then times
  # x can be either a list of fixest objects, either a fixest object

  check_arg(x, "class(fixest, fixest_list) mbt")
  check_arg(times, "integer scalar GE{1} | integer vector no na GE{0}")
  check_arg(each, "integer scalar GE{1} | logical scalar")
  check_arg(vcov, "class(list)")

  if(is_user_level_call()){
    validate_dots(suggest_args = c("times", "each"), stop = TRUE)
  }

  # Checking the arguments
  IS_LIST = FALSE
  if("fixest_list" %in% class(x)){
    IS_LIST = TRUE
    class(x) = "list"

    n = length(x)

  } else {
    n = 1
  }

  if(is.logical(each) && each == FALSE){
    stop("Argument 'each' cannot be equal to FALSE.")
  }

  IS_MULTI_VCOV = !missing(vcov)
  if(IS_MULTI_VCOV){
    n_vcov = length(vcov)

    if(times == 1 && each == 1){
      if(isTRUE(each)){
        each = n_vcov
      } else {
        times = n_vcov
      }

    }
  }

  res_int = rep(1:n, times = times, each = each)
  n_res = length(res_int)

  if(IS_MULTI_VCOV){
    # Checking and expanding

    vcov_mapping = 1:n_res
    if(times == 1){
      if(n_vcov != each && n_vcov != n_res){
        stop("In rep, the argument 'vcov' (currently of length ", n_vcov, ") must be a list either of length ", each, " or of length ", n_res, ".")
      }

      if(n_vcov == each) vcov_mapping = rep(1:each, times = n)

    } else if(each == 1){
      if(n_vcov != times && n_vcov != n_res){
        stop("In rep, the argument 'vcov' (currently of length ", n_vcov, ") must be a list either of length ", times, " or of length ", n_res, ".")
      }

      if(n_vcov == times) vcov_mapping = rep(1:n_vcov, each = n)

    } else {
      if(n_vcov != n_res){
        stop("In rep, the argument 'vcov' (currently of length ", n_vcov, ") must be a list either of length ", n_res, ".")
      }
    }
  }

  res = vector("list", length(res_int))

  if(IS_MULTI_VCOV){
    for(i in 1:n_res){
      if(IS_LIST){
        res[[i]] = summary(x[[res_int[i]]], vcov = vcov[[vcov_mapping[i]]])
      } else {
        res[[i]] = summary(x, vcov = vcov[[vcov_mapping[i]]])
      }
    }

  } else {
    for(i in unique(res_int)){
      if(IS_LIST){
        res[res_int == i] = x[[i]]
      } else {
        res[res_int == i] = list(x)
      }
    }
  }

  for(i in 1:n_res){
    res[[i]]$model_id = res_int[i]
  }

  res
}

#' @rdname rep.fixest
rep.fixest_list = function(x, times = 1, each = 1, vcov, ...){
  rep.fixest(x, times = times, each = each, vcov = vcov, ...)
}

#' @rdname rep.fixest
.l = function(...){

  check_arg(..., "mbt class(fixest) | list")

  dots = list(...)
  if(all(sapply(dots, function(x) "fixest" %in% class(x)))){
    class(dots) = "fixest_list"

    return(dots)
  }

  if(length(dots) == 1){

    if("fixest_multi" %in% class(dots[[1]])){
      res = dots[[1]]
      class(res) = "fixest_list"
      return(res)
    }

    if(all(sapply(dots[[1]], function(x) "fixest" %in% class(x)))){
      res = dots[[1]]
      class(res) = "fixest_list"
      return(res)
    }
  }

  res = list()
  for(i in seq_along(dots)){
    if("fixest" %in% class(dots[[i]])){
      res[[length(res) + 1]] = dots[[i]]
    } else {
      obj = dots[[i]]

      if("fixest_multi" %in% class(obj)){
        for(j in seq_along(obj)){
          res[[length(res) + 1]] = obj[[j]]
        }

      } else {
        for(j in seq_along(obj)){
          if(!"fixest" %in% class(obj[[j]])){
            stop("In .l(...), each argument must be either a fixest object, or a list of fixest objects. Problem: The ", n_th(j), " element of the ", n_th(i), " argument (the latter being a list) is not a fixest object.")
          }

          res[[length(res) + 1]] = obj[[j]]
        }
      }
    }
  }

  class(res) = "fixest_list"
  res
}



#' Retrieves the data set used for a `fixest` estimation
#' 
#' Retrieves the original data set used to estimate a `fixest` or `fixest_multi` model. 
#' Note that this is the original data set and not the data used for the estimation (i.e. it can have more rows).
#' 
#' @param x An object of class `fixest` or `fixest_multi`. For example obtained from [`feols`] or [`feglm`].
#' @param sample Either "original" (default) or "estimation". If equal to "original", 
#' it matches the original data set. If equal to "estimation", the rows of the data set
#' returned matches the observations used for the estimation. 
#' 
#' @return 
#' It returns a data.frame equal to the original data set used for the estimation, when the function was called.
#' 
#' If `sample = "estimation"`, only the lines used for the estimation are returned.
#' 
#' In case of a `fixest_multi` object, it returns the data set of the first estimation object. 
#' So in that case it does not make sense to use `sample = "estimation"` since 
#' the samples may be inconsistent across the different estimations.
#' 
#' @examples 
#' 
#' base = setNames(iris, c("y", "x1", "x2", "x3", "species"))
#' base$y[1:5] = NA
#' 
#' est = feols(y ~ x1 + x2, base)
#' 
#' # the original data set
#' head(fixest_data(est))
#' 
#' # the data set, with only the lines used for the estimation
#' head(fixest_data(est, sample = "est"))
#' 
#' 
fixest_data = function(x, sample = "original"){
  check_arg(x, "class(fixest, fixest_multi) mbt")
  check_set_arg(sample, "match(original, estimation)")
  
  if(inherits(x, "fixest_multi")){
    x = x[[1]]
  }
  
  res = fetch_data(x, prefix = "In `fixest_data()`, ")
  
  if(sample == "estimation"){
    res = res[obs(x), , drop = FALSE]
  }
  
  return(res)
}



#' Controls the parameters of the demeaning procedure
#' 
#' Fine control of the demeaning procedure. Since the defaults are sensible, 
#' only use this function in case of difficult convergence (e.g. in [`feols`] or [`demean`]).
#' That is, look at the slot `$iterations` of the returned object, if it's high (over 50), 
#' then it might be worth playing around with these settings.
#' 
#' @param extraProj Integer scalar, default is 0. Should there be more plain projection steps 
#' in between two accelerations? By default there is not. Each integer value adds 3 
#' simple projections.
#' This can be useful in cases where the acceleration algorithm does not work well 
#' but simple projections do.
#' @param iter_warmup Integer scalar, default is 15. Only used in the presence of 3 
#' or more fixed-effects (FE), ignored otherwise. For 3+ FEs, the algorithm is as follows:
#' 1) `iter_warmup` iterations on all FEs. If convergence: end of the algorithm. 2) Otherwise: 
#' a) demeaning over the first two largest FEs only, until convergence, then 
#' b) demeaning over all FEs until convergence. 
#' To skip the demeaning over 2 FEs, use a very high value of `iter_warmup`. To go directly
#' to the demeaning over 2 FEs, se `iter_warmup` to a value lower than or equal to 0.
#' @param iter_projAfterAcc Integer scalar, default is 40. After `iter_projAfterAcc` iterations
#' of the standard algorithm, a simple projection is performed right after 
#' the acceleration step. Use very high values to skip this step, or low values to apply this 
#' procedure right from the start.
#' @param iter_grandAcc Integer scalar, default is 4. The regular fixed-point algorithm 
#' applies an acceleration at each iteration. This acceleration is for `f(X)` 
#' (with `f` the projection). 
#' This settings controls a grand acceleration, which is instead for `f^k(X)` where
#' `k` is the value of `iter_grandAcc` and `f^2(X)` is defined as `f(f(X))` 
#' (i.e. the function `f` applied `k` times). By default, an additional acceleration 
#' is performed for `h(X) = f^4(X)` every 8 iterations (2 times 4, equivalent to 
#' the iterationsthe time to gather `h(X)` and `h(h(X))`).
#' @param internal Logical scalar, default is `FALSE`. If `TRUE`, no check on the arguments 
#' is performed and the returned object is a plain list. For internal use only.
#' 
#' @details 
#' The demeaning algorithm is a fixed-point algorithm. Basically a function `f` is applied
#' until `|f(X) - X| = 0`, i.e. there is no difference between `X` and its image. 
#' For terminology, let's call the application of `f` a "projection".
#' 
#' For well behaved problems, the algorithm in its simplest form, i.e. just applying `f` until
#' convergence, works fine and you only need a few iterations to reach convergence.
#' 
#' The problems arise for non well behaved problems. In these cases, simply applying the
#' function `f` can lead to extremely slow convergence. To handle these cases, this algorithm
#' applies a fixed-point acceleration algorithm, namely the "Irons and Tuck" acceleration. 
#' 
#' The main algorithm combines regular projections with accelerations. Unfortunately 
#' sometimes this is not enough, so we also resort on internal cuisine, detailed below.
#' 
#' Sometimes the acceleration in its simplest form does not work well, and garbles the 
#' convergence properties. In those cases:
#' - the argument `extraProj` adds several standard projections in between two accelerations,
#' which can improve the performance of the algorithm. By default there are no extra 
#' projections. Note that while it can reduce the total number of iterations until convergence, 
#' each iterations is almost twice expensive in terms of computing time.
#' - the argument `iter_projAfterAcc` controls whether, and when, to apply a simple projection
#' right after the acceleration step. This projection adds roughly a 33% increase in 
#' computing time per iteration but can improve the convergence properties and speed. By default
#' this step starts at iteration 40 (when the convergence rate is already not great).
#' 
#' On top of this, in case of very difficult convergence, a "grand" acceleration is added to 
#' the algorithm. The regular acceleration is over `f`. Say `g` is the function equivalent to
#' the application of one regular iteration (which is a combination of one acceleration with 
#' several projections).
#' By default the grand acceleration is over `h = g o g o g o g`, otherwise `g` applied four times. 
#' The grand acceleration is controled with the argument `iter_grandAcc` which corresponds 
#' to the number of iterations of the regular algorithm defining `h`.
#' 
#' Finally in case of 3+ fixed-effects (FE), the convergence in general takes more iterations.
#' In cases of the absence of quick convergence, applying a first demeaning over the first
#' two largest FEs before applying the demeaning over all FEs can improve convergence speed.
#' This is controlled with the argument `iter_warmup` which gives the number of iterations
#' over all the FEs to run before going to the 2 FEs demeaning. By default, the deameaning 
#' over all FEs is run for 15 iterations before switching to the 2 FEs case.  
#' 
#' The above defaults are the outcome of extended empirical applications, and try to strike a 
#' balance across a majority of cases. Of course you can always get better results by tailoring
#' the settings to your problem at hand.
#' 
#' @references 
#' B. M. Irons, R. Tuck, "A version of the Aitken accelerator for computer iteration",
#' International journal of numerical methods in engineering 1 (1969) 670 275–277.
#' 
#' 
#' @return 
#' This function returns a list of 4 integers, equal to the arguments passed by the user. 
#' That list is of class `demeaning_algo`.
#' 
#' 
#' 
demeaning_algo = function(extraProj = 0, iter_warmup = 15, iter_projAfterAcc = 40, 
                          iter_grandAcc = 4, internal = FALSE){
  
  if(internal){
    return(list(extraProj = extraProj, iter_warmup = iter_warmup, 
                iter_projAfterAcc = iter_projAfterAcc, iter_grandAcc = iter_grandAcc))
  }
  
  check_arg("integer scalar", extraProj, iter_warmup, iter_projAfterAcc, iter_grandAcc)
  
  res = list(extraProj = extraProj, iter_warmup = iter_warmup, 
             iter_projAfterAcc = iter_projAfterAcc, iter_grandAcc = iter_grandAcc)
  class(res) = "demeaning_algo"
  
  res
}

#### ------------- ####
#### Internal Funs ####
####



as.character.formula = function(x, ...) as.character.default(x, ...)

parse_macros = function(..., reset = FALSE, from_xpd = FALSE, check = TRUE, frame = NULL){
  set_up(1)

  if(check){
    check_arg(..., "dotnames os formula | character vector no na | numeric scalar | class(call, name)", .message = paste0("Each element of '...' must be a one-sided formula, and the name of each argument must start with two dots (ex: ", ifelse(from_xpd, "xpd(fml, ..ctrl = ~ x5 + x6)", "setFixest_fml(..ctrl = ~ x5 + x6)"), ").\nAlternatively it can be a character vector of variable names, or a numeric scalar."))
  }

  # Original macros
  fml_macro = getOption("fixest_fml_macro")
  if(reset || is.null(fml_macro)){
    fml_macro = list()
  } else if(!is.list(fml_macro)){
    warn_up("The value of getOption(\"fixest_fml_macro\") wasn't legal, it has been reset.")
    fml_macro = list()
    options(fixest_fml_macro = list())
  }

  # We check the names
  if(...length() == 0) return(fml_macro)

  dots = list(...)

  # Setting the new macros
  if(check){
    qui_pblm = !grepl("^\\.\\.", names(dots))
    if(any(qui_pblm)){
      arg_name = names(dots)[qui_pblm][1]
      correct_name = gsub("^\\.*", "\\.\\.", arg_name)
      stop_up("Each argument name must start with two dots. Use {q?correct_name} instead of {q?arg_name}.")
    }
  }

  for(v in names(dots)){
    fml_macro[[v]] = value2stringCall(dots[[v]], check = check, frame = frame)
  }

  fml_macro
}

value2stringCall = function(value_raw, call = FALSE, check = FALSE, frame = NULL){

  if(any(c("call", "name") %in% class(value_raw))){
    res = if(call) value_raw else deparse_long(value_raw)

  } else if(inherits(value_raw, "formula")){
    res = if(call) value_raw[[2]] else as.character(value_raw)[[2]]

  } else {

    if(check){
      value_raw = grep("[[:alnum:]]", value_raw, value = TRUE)
      if(length(value_raw)){
        # We need to check that it leads to a valid formula => otherwise problems later

        for(i in seq_along(value_raw)){
          value_raw[i] = dot_square_bracket(value_raw[i], frame)
        }

        value_raw = paste(value_raw, collapse = " + ")

        my_call = error_sender(str2lang(value_raw), "The value '", value_raw, 
                               "' does not lead to a valid formula: ", up = 2)
        res = if(call) my_call else value_raw

      } else {
        res = if(call) 1 else "1"
      }

    } else {
      value_raw = value_raw[nzchar(value_raw)]
      if(length(value_raw)){
        value_raw = paste(value_raw, collapse = "+")
        res = if(call) str2lang(value_raw) else value_raw
      } else {
        res = if(call) 1 else "1"
      }
    }
  }

  if(check && "[" %in% all.vars(res, functions = TRUE)){
    res_txt = dot_square_bracket(deparse_long(res), frame)
    res = as.formula(res_txt)
  }

  res
}

dot_square_bracket = function(x, frame = .GlobalEnv, regex = FALSE, text = FALSE,
                              forced_merge = FALSE, up = 0){
  # transforms "x.[i]" into x1 if i==1
  # z = "XX" ; x = ".[z] + x.[1:5] + y.[1:2]_t"
  # x = "x.[a] | fe[b] + m.[o]"
  # .[,stuff] => means aggregation is comma based
  # if text, we allow:
  # - before.["text", stuff]after AND .[stuff, "text"]
  #   * .["text", stuff] => collapses stuff with "text"
  #     ie paste0(before, paste0(stuff, collapse = "text"), after)
  #   * .["text", stuff] => collapses stuff WITH the previous string with "text",
  #     ie paste0(paste0(before, stuff, collapse = "text), after)
  # regex = FALSE ; frame = .GlobalEnv ; text = FALSE
  # x = 'c(.[,y]) ~ sw(x.[,1:3])'
  # name = c("Jon", "Juliet") ; x = "bonjour .[' and ', name]" ; text = TRUE
  # name = c("Jon", "Juliet") ; x = "bonjour .[name, ' and ']" ; text = TRUE

  if(!grepl(".[", x, fixed = TRUE)) return(x)

  x_split_open = strsplit(x, ".[", fixed = TRUE)[[1]]

  # comma aggregation
  is_comma = grepl("^,", x_split_open[-1])
  if(any(is_comma) && !text){
    x_split_open[-1] = gsub("^, *", "", x_split_open[-1])
  }

  # nesting: a ~ .["x.[1:2]_sq"] + b
  any_nested = any(grepl("^\"", x_split_open[-1])) && !text
  if(any_nested){

    i_open_quote = setdiff(which(grepl("^\"", x_split_open)), 1)
    i_close_quote = which(grepl("\"\\]", x_split_open))

    x_split_new = x_split_open[1]
    for(i in i_open_quote){
      j = i_close_quote[i_close_quote >= i]

      xi_new_all = x_split_open[i:j]
      xi_new_all = gsub("\\]", "__close__", xi_new_all)

      xi_new = paste0(xi_new_all, collapse = "__open__")
      xi_new = gsub("\"__close__", "\"]", xi_new)

      x_split_new[[length(x_split_new) + 1]] = xi_new
    }

    n = length(x_split_open)
    if(j < n){
      for(i in (j+1):n){
        x_split_new[[length(x_split_new) + 1]] = x_split_open[i]
      }
    }

    x_split_open = x_split_new
  }

  # Finding the elements left/right of the closing bracket
  # we take care of indexing within brackets (e.g. x1 + .[m[1]])
  b_open = gregexpr("[", x_split_open[-1], fixed = TRUE)
  b_close = gregexpr("]", x_split_open[-1], fixed = TRUE)

  x_split = character(length(x_split_open) * 2 - 1)
  n = length(x_split)
  x_split[1] = x_split_open[1]

  for(i in seq_along(b_open)){

    if(b_open[[i]][1] != -1){
      # means there was an open [
      n_extend = length(b_close[[i]]) - length(b_open[[i]])
      index_closing = b_close[[i]][which.max(b_close[[i]] < c(b_open[[i]], rep(Inf, n_extend)))]
    } else {
      index_closing = b_close[[i]][1]
    }

    x_SO = x_split_open[i + 1]
    x_split_close_left = substr(x_SO, 1, index_closing - 1)
    x_split_close_right = substr(x_SO, index_closing + 1, nchar(x_SO))

    j = 2 * i
    x_split[[j]] = x_split_close_left
    x_split[[j + 1]] = x_split_close_right
  }

  if(any_nested){
    x_split = gsub("__open__", ".[", x_split)
    x_split = gsub("__close__", "]", x_split)
  }

  if(text){
    # NA means no operation
    operator = rep(NA_character_, n)
    do_split = rep(FALSE, n)
  }

  res = as.list(x_split)
  for(i in (1:n)[(1:n) %% 2 == 0]){

    # catching the operation
    do_lang = TRUE
    if(text){

      x_txt = trimws(x_split[i])

      op = NULL
      is_agg = is_split = FALSE
      if(grepl("^,", x_txt)){
        # default value of the aggregation
        op = ''
        x_txt = str_trim(x_txt, 1)
        is_agg = TRUE
      } else if(grepl("^/", x_txt)){
        # default value of the split
        op = "@, *"
        x_txt = str_trim(x_txt, 1)
        is_split = TRUE
      }

      if(!is_split && !is_agg){
        info_op = regexpr("^(('[^']*')|(\"[^\"]*\"))[,/]", x_txt)
        if(info_op != -1){
          arg_pos = attr(info_op, "match.length")
          op = substr(x_txt, 2, arg_pos - 2)
          arg = substr(x_txt, arg_pos, arg_pos)
          x_txt = str_trim(x_txt, arg_pos)

          is_agg = arg == ","
          is_split = !is_agg
        }
      }

      if(is_split){
        do_lang = FALSE

        operator[i] = op
        do_split[i] = TRUE
        my_call = x_txt

      } else if(is_agg){
        do_lang = FALSE

        x_txt = paste0("dsb_check_set_agg(", x_txt, ")")
        my_list = list(dsb_check_set_agg = dsb_check_set_agg)
        my_call = error_sender(eval(str2lang(x_txt), my_list, frame),
                               "Dot square bracket operator: Evaluation of `.[",
                               x_split[i], "]` led to an error:",
                               up = up + 1)

        operator[i] = op
      }
    }


    if(do_lang){
      my_call = error_sender(str2lang(x_split[i]),
                             "Dot square bracket operator: Evaluation of `.[",
                             x_split[i], "]` led to an error:",
                             up = up + 1)
    }

    if(is.character(my_call) && grepl(".[", my_call, fixed = TRUE)){
      # Nested call
      value = .dsb(my_call, frame = frame)

    } else {
      # Informative error message
      value = error_sender(as.character(eval(my_call, frame)),
                           "Dot square bracket operator: Evaluation of `.[",
                           x_split[i], "]` led to an error:",
                           up = up + 1)

      if(length(value) == 2 && value[1] == "~"){
        value = char_to_vars(value[2])
      }
    }

    if(length(value) == 0 || (is.character(value) && length(value) == 1 && nchar(value) == 0)){
      # Neutral element in formulas
      value = "1"
    }

    res[[i]] = value
  }

  if(any(is_comma)){
    is_comma = insert_in_between(FALSE, is_comma)
  } else if(!text){
    is_comma = rep(FALSE, length(res))
  }

  if(max(lengths(res)) == 1) {

    if(text && any(do_split)){
      res_txt = res[[1]]
      for(i in 2:length(res)){

        if(do_split[i]){
          res_txt = paste0(res_txt, str_split(res[[i]], operator[i])[[1]])
        } else {
          res_txt = paste0(res_txt, res[[i]])
        }
      }

      res = res_txt

    } else {
      res = paste(res, collapse = "")
    }

  } else {
    # first value is NEVER a vector ("" is added automatically in split)
    res_txt = res[[1]]
    i = 2
    while(i <= n){

      if(length(res[[i]]) == 1){

        if(text && do_split[i]){
          res_txt = paste0(res_txt, str_split(res[[i]], operator[i])[[1]])
        } else {
          res_txt = paste0(res_txt, res[[i]])
        }

        i = i + 1

      } else if(text){
        if(is.na(operator[i])){
          if(forced_merge){
            res_txt = paste0(res_txt, paste0(res[[i]], collapse = "_MERGE_"))
          } else {
            res_txt = paste0(res_txt, res[[i]])
          }

        } else {
          # If we're here => must be an aggregation requested
          # (if it was a split, it would be of length 1)
          res_txt = paste0(res_txt, paste0(res[[i]], collapse = operator[i]))
        }

        i = i + 1

      } else if(regex) {
        after = if(i != n) res[[i + 1]] else ""
        res_txt = paste0(res_txt, res[[i]], after, collapse = "|")
        i = i + 2

      } else {
        before_no_var = gsub("[[:alnum:]_\\.]+$", "", res_txt)
        var_before = substr(res_txt, nchar(before_no_var) + 1, nchar(res_txt))

        coll = if(is_comma[i]) ", " else " + "

        if(i != n && grepl("[[:alnum:]_\\.]", substr(res[[i + 1]], 1, 1))){
          after = res[[i + 1]]
          after_no_var = gsub("^[[:alnum:]_\\.]+", "", after)
          var_after = substr(after, 1, nchar(after) - nchar(after_no_var))

          res_txt = paste0(before_no_var, 
                           paste0(var_before, res[[i]], var_after, collapse = coll), 
                           after_no_var)
          i = i + 2
        } else {
          res_txt = paste0(before_no_var, paste0(var_before, res[[i]], collapse = coll))
          i = i + 1
        }
      }
    }

    res = res_txt
  }


  # Recursivity prevention: no recursivity if text = TRUE
  DSB_RECURSIVE = TRUE
  is_rec = exists("DSB_RECURSIVE", parent.frame(), inherits = FALSE)

  if(!text && !is_rec && grepl(".[", res, fixed = TRUE)){
    res = dot_square_bracket(res, frame, regex)
  }

  res
}


dsb_check_set_agg = function(...){
  # Here we should have only one element, everything has been cleaned up before

  if(...length() > 1){
    if(...length() == 2){
      stop_up("The operator .[] accepts only up to two elements, if so the first one MUST be a string literal (eg: .['text', y]). Problem: it is not a string literal.")
    }
    stop_up("You cannot have more than two elements in between .[]. Currently there are ", 
            ...length() + 1, ".")
  }

  mc = match.call()

  return(mc[[2]])
}

print_coeftable = function(coeftable, lastLine = "", show_signif = TRUE){
  # Simple function that does as the function coeftable but handles special cases
  # => to take care of the case when the coefficient is bounded

  if(!is.data.frame(coeftable)){
    class(coeftable) = NULL
    ct = as.data.frame(coeftable)
  } else {
    ct = coeftable
  }

  signifCode = c("***"=0.001, "** "=0.01, "*  "=0.05, ".  "=0.1)

  pvalues = ct[, 4]

  stars = cut(pvalues, breaks = c(-1, signifCode, 100), labels = c(names(signifCode), ""))
  stars[is.na(stars)] = ""

  whoIsLow = !is.na(pvalues) & pvalues < 2.2e-16

  # Note that it's a bit different than format => I don't like xxe-yy numbers, very hard to read: you can't see large/small nbers at first sight
  for(i in 1:3){
    ct[, i] = decimalFormat(ct[, i])
  }

  ct[!whoIsLow, 4] = format(ct[!whoIsLow, 4], digits = 5)

  ct[whoIsLow, 4] = "< 2.2e-16"
  ct[is.na(ct[, 4]), 4] = "NA"

  ct[, 5] = stars
  names(ct)[5] = ""

  print(ct)

  cat(lastLine)

  if(show_signif){
    cat("---\nSignif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n")
  }
}

plot_single_cluster = function(x, n=5, addExp = FALSE, fe_name, ...){
  # It plots the n first and last most notable FEs

  isSlope = grepl("\\[", fe_name)

  # we compare with the average of the coefficients
  if(isSlope == FALSE) x = sort(x) - mean(x)

  x_name = names(x)

  k = length(x)
  nb_show = min(k, 2*n+1)
  mid_value = floor(nb_show/2)
  xlim = c(1, nb_show)
  xlim = xlim + c(-1, 1) * diff(xlim)/30

  if(k <= 2*n+1){
    # no need of space to split the data
    x_values = 1:k
    y_values = x
    isSplit = FALSE
  } else {
    nb_show = nb_show - 1 # because we don't want to show the middle point
    x_values = c(1:n, (n+2):(2*n+1))
    y_values = c(head(x, n), tail(x, n))
    isSplit = TRUE
  }

  # very specific case where the axis confonds with the boxing
  ylim = range(y_values)
  if(pretty(range(y_values))[1] < min(y_values) || tail(pretty(range(y_values)), 1) > max(y_values)){
    ylim = range(y_values) + c(-1, 1) * diff(range(y_values))/30
  }

  plot(x_values, y_values, ann = FALSE, axes = FALSE, xlim=xlim, ylim = ylim, col = 0)

  # display
  box()
  y = axis(2)
  abline(h = y, lty = 4, col = "lightgrey")

  # name information & points
  points(x_values, y_values)
  text(head(x_values, mid_value), head(y_values, mid_value), head(x_name, mid_value), pos = 3)
  text(tail(x_values, nb_show-mid_value), 
       tail(y_values, nb_show-mid_value), 
       tail(x_name, nb_show-mid_value), pos = 1)

  if(isSlope){
    fe = gsub("\\[.+", "", fe_name)
    slope = gsub(".+\\[|\\].+", "", fe_name)
    title(xlab = paste0("Slope Coefficients (", fe, ")"), line = 1)
    title(main = slope)
  } else {
    title(xlab = "Centered Fixed-Effects", line = 1)
    title(main = fe_name)
  }


  if(isSplit){
    abline(v = c(n+0.75, n+1.25), lty = 2)

    axis(1, at = c(n+0.75, n+1.25), col = "grey", labels = NA, lwd.ticks = 0)
    axis(3, at = c(n+0.75, n+1.25), col = "grey", labels = NA, lwd.ticks = 0)
  }

  coord = par("usr")
  # axis(1, at = coord[1:2], labels = c("coef", "exp(coef)"), lwd = 0, lwd.ticks = 1)

  if(addExp && !isSlope){
    axis(1, at = coord[1], labels = "coef", lwd = 0, lwd.ticks = 1, line = -1)
    axis(1, at = coord[2], labels = "exp(coef)", lwd = 0, lwd.ticks = 1, line = -1)
    axis(4, at=y, labels = signif(exp(y), 2))
  }

}

prepare_matrix = function(fml, base, fake_intercept = FALSE){
  # This function is way faster than model.matrix but does not accept factors
  # The argument fml **MUST** not have factors!

  rhs = if(length(fml) == 3) fml[c(1, 3)] else fml

  t = terms(rhs, data = base)

  all_var_names = attr(t, "term.labels")

  # We take care of interactions: references can be multiple, then ':' is legal
  all_vars = cpp_colon_to_star(all_var_names)

  # Forming the call
  if(attr(t, "intercept") == 1 && !fake_intercept){
    n = nrow(base)
    all_vars_call = str2lang(paste0("list('(Intercept)' = rep(1, ", n, "), ", 
                                    paste0(all_vars, collapse = ", "), ")"))
    all_var_names = c("(Intercept)", all_var_names)
  } else {
    all_vars_call = str2lang(paste0("list(", paste0(all_vars, collapse = ", "), ")"))
  }

  # evaluation
  data_list = eval(all_vars_call, base)

  # Handling the multi columns case (ex: bs(x1), splines)
  # NOTA: I need to add a check for i() because of 1 value interactions
  #       not caught by the != nber of obs
  qui_i = grepl("\\bi\\(", all_var_names)
  if(any(lengths(data_list) != nrow(base)) || any(qui_i)){

    all_n = as.vector(lengths(data_list) / nrow(base))

    qui_pblm = which(all_n %% 1 != 0)
    if(length(qui_pblm) > 0){
      what = data_list[[qui_pblm]]
      reason = ifelse(is.null(nrow(what)), 
                      paste0("of length ", length(what)), 
                      paste0("with ", nrow(what), " rows"))

      stopi("Evaluation of {bq?all_var_names[qui_pblm]}, returns an object {reason}", 
            " while the data set has {n ? nrow(base)} rows.", )
    }

    all_n_vector = rep(all_n, all_n)

    new_names = as.list(all_var_names)
    for(i in which(all_n > 1 | qui_i)){
      my_names = colnames(data_list[[i]])
      if(is.null(my_names)){
        my_names = 1:all_n[i]
      }
      new_names[[i]] = paste0(all_var_names[i], my_names)
    }

    all_var_names = unlist(new_names)
  }

  res = do.call("cbind", data_list)

  colnames(res) = all_var_names

  res
}


fixest_model_matrix = function(fml, data, fake_intercept = FALSE, i_noref = FALSE, mf = NULL){
  # This functions takes in the formula of the linear part and the
  # data
  # It reformulates the formula (ie with lags and interactions)
  # then either apply a model.matrix
  # either applies an evaluation (which can be faster)
  #
  # fake_intercept => whether to add the intercept, only to make sure
  #  the factors are well created

  # fml = ~a*b+c+i(x1)+Temp:i(x2)+i(x3)/Wind
  
  if(!identical(class(data), "data.frame")){
    # when this function is called from within the estimations, 
    # data is always a pure DF
    #
    # to avoid bugs when this function is called from elsewhere, we 
    # ensure the data is always a pure DF
    class(data) = "data.frame"
  }

  if(length(fml) == 3) fml = fml[c(1, 3)]

  # We need to check a^b otherwise error is thrown in terms()
  rhs_txt = deparse_long(fml[[2]])
  if(grepl("\\^[[:alpha:]]", rhs_txt)){
    stop("The special operator `^` can only be used in the fixed-effects part of the formula. Please use `:` instead.")
  }

  #
  # Evaluation
  #

  t_fml = terms(fml)
  tl = attr(t_fml, "term.labels")

  if(length(tl) == 0){

    if(fake_intercept){
      return(1)
    }

    res = matrix(1, nrow = nrow(data), ncol = 1, dimnames = list(NULL, "(Intercept)"))
    return(res)
  }

  # We check for calls to i()
  qui_i = grepl("(^|[^[:alnum:]_\\.])i\\(", tl)
  IS_I = any(qui_i)
  if(IS_I){
    # OMG... why do I always have to reinvent the wheel???
    is_intercept = fake_intercept || (attr(t_fml,"intercept") == 1)
    i_naked = which(is_naked_fun(tl[qui_i], "i"))

    if(i_noref){
      for(i in seq_along(i_naked)){
        j = i_naked[i]
        txt = gsub("(^|(?<=[^[:alnum:]\\._]))i\\(", "i_noref(", tl[qui_i][j], perl = TRUE)
        tl[qui_i][j] = eval(str2lang(txt))
      }
    } else {
      for(i in seq_along(i_naked)){
        if(!is_intercept && i == 1) next

        j = i_naked[i]
        txt = gsub("(^|(?<=[^[:alnum:]\\._]))i\\(", "i_ref(", tl[qui_i][j], perl = TRUE)
        tl[qui_i][j] = eval(str2lang(txt))
      }
    }

    fml_no_inter = .xpd(rhs = tl[!qui_i])

    if(!is_intercept) tl = c("-1", tl)
    fml = .xpd(rhs = tl)

  }

  # Are there factors NOT in i()? If so => model.matrix is used
  dataNames = names(data)
  if(!is.null(mf)){
    useModel.matrix = TRUE
  } else {
    useModel.matrix = FALSE
    if(IS_I){
      linear.varnames = all.vars(fml_no_inter[[2]])
      is_num = sapply(data[, dataNames %in% linear.varnames, FALSE], is.numeric)
      if(length(is_num) > 0 && (any(!is_num) || grepl("factor", deparse_long(fml_no_inter)))){
        useModel.matrix = TRUE
      }
    } else {
      linear.varnames = all.vars(fml[[2]])
      is_num = sapply(data[, dataNames %in% linear.varnames, FALSE], is.numeric)
      if(length(is_num) == 0 || any(!is_num) || grepl("factor", deparse_long(fml))){
        useModel.matrix = TRUE
      }
    }
  }

  if(useModel.matrix){
    # to catch the NAs, model.frame needs to be used....

    if(is.null(mf)){
      mf = stats::model.frame(fml, data, na.action = na.pass)
    } else {
      # => predict, newdata
      # case i() + anything that requires evaluation based on the raw data (poly, factor, etc)
      # is there any drawback?
      fml = formula(mf)
      if(length(fml) == 3){
        # bug R 3.5.0
        fml_new = update(fml, .~xxxxxx + .)
        fml_new[[3]][[2]] = fml[[2]]
        fml_new[[2]] = NULL
        fml = fml_new
      }
    }

    linear.mat = stats::model.matrix(fml, mf)

    if(fake_intercept){
      who_int = which("(Intercept)" %in% colnames(linear.mat))
      if(length(who_int) > 0){
        linear.mat = linear.mat[, -who_int, drop = FALSE]
      }
    }
  } else {
    linear.mat = prepare_matrix(fml, data, fake_intercept)
  }

  if(any(grepl("__CLEAN__", colnames(linear.mat), fixed = TRUE))){
    new_names = clean_interact_names(colnames(linear.mat))

    colnames(linear.mat) = new_names
  }

  if(is.integer(linear.mat)){
    linear.mat = 1 * linear.mat
  }

  linear.mat
}


fixest_model_matrix_extra = function(object, newdata, original_data, fml, 
                                     fake_intercept = FALSE, i_noref = FALSE, subset = FALSE){
  # Only used within model.matrix and predict
  # Overlay of fixest_model_matrix to take care of special things, eg:
  # - poly
  # - subset
  # - ?

  #
  # subset
  #

  # subset => we allow the extraction of only some variables
  all_vars = all_vars_with_i_prefix(fml[[3]])
  if(isFALSE(subset)){

    if(!original_data && any(!all_vars %in% names(newdata))){
      pblm = setdiff(all_vars, names(newdata))
      stop("In `model.matrix`, the variable", enumerate_items(pblm, "is.s.quote"), " in the formula but not in the argument `data`. Use `subset = TRUE` to enable the creation of partial data.")
    }

  } else {
    vars_keep = names(newdata)

    if(is.character(subset)){
      # ex: subset = c("x1$", "x2$")

      vars_keep = keep_apply(vars_keep, subset)
      if(length(vars_keep) == 0){
        stop("The variables in `subset` do not match any variable in the `data`.")
      }

      if(isFALSE(keep_apply("(Intercept)", subset, logical = TRUE))){
        fake_intercept = TRUE
      }

    } else {
      # intercept always removed if subset = TRUE!!!
      # that's the point of subset.

      fake_intercept = TRUE
    }

    if(!all(all_vars %in% vars_keep)){

      terms_all = attr(terms(fml), "term.labels")
      # We first check pure variables (because non pure variables are slower to check)
      is_var = grepl("^[\\.[:alpha:]][[:alnum:]\\._]*$", terms_all)
      terms_drop = is_var & !terms_all %in% vars_keep

      for(i in which(!is_var)){
        if(any(!all_vars_with_i_prefix(str2lang(terms_all[i])) %in% vars_keep)){
          terms_drop[i] = TRUE
        }
      }

      if(all(terms_drop)){
        stop("Due to the use of the argument `subset`, not a single variable is left.")
      }

      fml = .xpd(rhs = terms_all[!terms_drop])
    }
  }

  #
  # Extra functions that need raw data-evaluation + single valued factors
  #

  mf = NULL
  if(!original_data){

    # What I don't like here is:
    # to take care of only two particular cases, namely
    #  + functions using the original data
    #  + single value factors
    # we incur a lot of computing time to EVERYONE
    # That's really not good.
    # The problem is that it's not so easy to catch those cases 100% without error.
    # I shall find a solution at some point.

    # if lean = TRUE, we should be avoiding that
    # => I don't know of a solution yet...

    if(length(fml) == 3) fml = fml[c(1, 3)]

    # We apply model.frame to the original data
    data = fetch_data(object, "To apply `model.matrix.fixest`, ")

    panel__meta__info = set_panel_meta_info(object, data)

    mf = model.frame(fml, data, na.action = na.pass)

    rm(panel__meta__info) # needed, so that when the data is recreated for real

    t_mf = terms(mf)
    xlev = .getXlevels(t_mf, mf)

    if(!identical(attr(t_mf,"variables"), attr(t_mf,"predvars")) || length(xlev) > 0){
      mf = model.frame(t_mf, newdata, xlev = xlev, na.action = na.pass)
    } else {
      mf = NULL
    }
  }

  GLOBAL_fixest_mm_info = list()

  I_IGNORE_ERRORS = TRUE

  new_matrix = fixest_model_matrix(fml, newdata, fake_intercept, i_noref, mf = mf)
  
  if(R.Version()$major == "3"){
    colnames(new_matrix) = gsub("`", "", colnames(new_matrix), fixed = TRUE)
  }

  if(length(GLOBAL_fixest_mm_info) > 0){
    attr(new_matrix, "model_matrix_info") = GLOBAL_fixest_mm_info
  }

  new_matrix
}





fixef_terms = function(fml, stepwise = FALSE, origin_type = "feols"){
  # separate all terms of fml into fixed effects and varying slopes
  # fml: one sided formula
  # can also be a vector of terms
  # Je fais tout ce tralala a cause de ce putain de terms() qui ne marche pas avec a^b!!!! fait chier!

  # fixef_terms(~dum_1[[x1]] + dum_2[x2])
  # fixef_terms(~dum_1[[x1]] + dum_2 + dum_3[x2, x3])

  if(!is.vector(fml)){
    # we need to take care of the ^ used to combine variables
    fml_char = as.character(fml[2])
    if(grepl("^", fml_char, fixed = TRUE)){
      fml_char_new = gsub("^", "%^%", fml_char, fixed = TRUE)
      fml = as.formula(paste0("~", fml_char_new))
    }

    if(!origin_type %in% c("feols", "feglm") && grepl("[", fml_char, fixed = TRUE)){
      stop("The use of varying slopes is available only for the functions feols, feglm or fepois.")
    }

    t = terms(fml)

    if(stepwise){
      sw_info = extract_stepwise(tms = t)
      return(sw_info)
    }

    my_vars = attr(t, "term.labels")

  } else {
    my_vars = fml
  }

  # Further error checking (misuse of varying slopes)
  if(any(grepl("]", my_vars, fixed = TRUE))){
    var2check = my_vars[grepl("]", my_vars, fixed = TRUE)]
    var2check_double = var2check[grepl("]]", var2check, fixed = TRUE)]
    var2check_single = var2check[!grepl("]]", var2check, fixed = TRUE)]
    if(length(var2check_double) > 0){
      qui = !grepl("\\]\\]$", var2check_double) | !grepl("\\[\\[", var2check_double) | lengths(strsplit(var2check_double, "\\[")) != 3
      if(any(qui)){
        item_pblm = var2check_double[qui]
        msg = sma("Square bracket are special characters used **only** to designate ",
                  "varying slopes (see help). ",
                  "They are currenlty misused (it concerns {$enum.bq?item_pblm}).")
        class(msg) = "try-error"
        return(msg)
      }
    }

    if(length(var2check_single) > 0){
      qui = !grepl("\\]$", var2check_single) | lengths(strsplit(var2check_single, "\\[")) != 2
      if(any(qui)){
        item_pblm = var2check_single[qui]
        msg = sma("Square bracket are special characters used **only** to designate ",
                  "varying slopes (see help). ",
                  "They are currenlty misused (it concerns {$enum.bq?item_pblm}).")
        class(msg) = "try-error"
        return(msg)
      }
    }
  }

  # And yet again some error checking => i() should NOT be used
  if(any(grepl("^i\\(", my_vars))){
    # We create an error instead of simply correcting the syntax => this is because the function i is
    # very different and should not be confused

    var_pblm = my_vars[grepl("^i\\(", my_vars)][1]

    get_new_var = function(var, f, f2, ...) match.call()

    what = eval(str2lang(gsub("^i", "get_new_var", var_pblm)))
    n_var = sum(c("var", "f", "f2") %in% names(what))
    
    if(n_var == 1){
      msg = "Using i() to create fixed-effects is not possible, use directly the variable."
    } else {
      msg = sma("To interact fixed-effects, use the syntax fe1^fe2 ",
                "(in your case {deparse(what[[2]])}^{deparse(what[[3]])}).")
    }

    stop("The function i() should not be used in the fixed-effects part of the formula. ", msg)
  }

  # Internal function
  reformulate_varslope = function(dum, ..., add_dum = TRUE){
    mc = match.call()
    dum = deparse_long(mc[["dum"]])

    if(add_dum){
      res = dum
    } else {
      res = c()
    }

    for(i in 3:(length(mc) - !add_dum)){
      value = deparse_long(mc[[i]])
      res = c(res, paste0(dum, "[[", value, "]]"))
    }
    res
  }

  qui_slope = grepl("[", my_vars, fixed = TRUE)
  if(any(qui_slope)){
    new_terms = c()

    vars_slope = my_vars[qui_slope]
    for(i in seq_along(my_vars)){
      if(qui_slope[i]) {
        v = my_vars[i]
        v_mod = paste0("reformulate_varslope(", gsub("\\[+", ", ", v))
        add_dum = ifelse(grepl("\\[\\[", v), ", add_dum = FALSE", "")
        v_mod = gsub("\\]+", paste0(add_dum, ")"), v_mod)

        new_v = eval(str2lang(v_mod))
        new_terms = c(new_terms, new_v)
      } else {
        new_terms = c(new_terms, my_vars[i])
      }
    }

    my_vars = new_terms
  }

  # we return fml_terms, fe_vars, slope_vars, slope_flag

  # we need to unique it
  my_vars = unique(my_vars)

  # we put the ^ back
  my_vars = gsub(" %^% ", "^", my_vars, fixed = TRUE)

  res = list(fml_terms = my_vars)
  # OLD version
  fe_vars_all = gsub("\\[.+", "", my_vars)
  is_slope_all = grepl("\\[", my_vars)
  slope_vars_all = rep(NA_character_, length(my_vars))
  slope_vars_all[is_slope_all] = gsub(".+\\[|\\]", "", my_vars[is_slope_all])

  # New version following the reworking of demeaning
  fe_vars = unique(fe_vars_all)
  #fe_vars: unique of the fixed-effects variables
  slope_flag = rep(0, length(fe_vars))
  # slope_flag: 0: no Varying slope // > 0: varying slope AND fixed-effect // < 0: varying slope WITHOUT fixed-effect

  slope_vars_list = vector("list", length(fe_vars))
  names(slope_vars_list) = fe_vars
  for(i in seq_along(fe_vars)){
    qui = which(fe_vars_all == fe_vars[i])
    if(any(is_slope_all[qui])){
      nb = sum(is_slope_all[qui])
      slope_flag[i] = nb * (1 - 2*(length(qui) == nb))
      slope_vars_list[[i]] = slope_vars_all[qui][is_slope_all[qui]]
    }
  }

  res$fe_vars = fe_vars
  res$slope_flag = as.integer(slope_flag)
  res$slope_vars_list = slope_vars_list
  res$slope_vars = unlist(slope_vars_list, use.names = FALSE)

  res
}

prepare_df = function(vars, data, fastCombine = NA){
  # vars: vector of variables to evaluate
  # data: a data.frame#
  # note that this is an internal function and the data should be checked beforehand

  # we drop NAs and make it unique
  vars = unique(vars[!is.na(vars)])
  all_var_names = vars

  do_combine = !is.na(fastCombine)

  changeNames = FALSE
  if(do_combine && any(grepl("^", vars, fixed = TRUE))){
    # special indicator to combine factors
    # ^ is a special character: only used to combine variables!!!

    vars = fml_combine(vars, fastCombine, vars = TRUE)

    changeNames = TRUE
  }

  all_vars = cpp_colon_to_star(vars)

  if(all(all_vars %in% names(data))){
    # we ensure the data is a pure DF 
    if(!identical(class(data), "data.frame")){
      class(data) = "data.frame"
    }
    res = data[, all_vars, drop = FALSE]
  } else {
    all_vars_call = str2lang(paste0("list(", paste0(all_vars, collapse = ", "), ")"))
    data_list = try(eval(all_vars_call, data))

    # if error: we send it back to the main function
    if("try-error" %in% class(data_list)){
      return(data_list)
    }

    names(data_list) = all_var_names
    data_list$stringsAsFactors = FALSE

    res = do.call("data.frame", data_list)

    if(changeNames){
      all_var_names = rename_hat(all_var_names)
    }

    names(res) = all_var_names
  }

  res
}


# fml_char = "x + y + u^factor(v1, v2) + x5"
fml_combine = function(fml_char, fastCombine, vars = FALSE){
  # function that transforms "hat" interactions into a proper function call:
  # Origin^Destination^Product + Year becomes ~combine_clusters(Origin, Destination, Product) + Year

  fun2combine = ifelse(fastCombine, "combine_clusters_fast", "combine_clusters")

  # we need to change ^ into %^% otherwise terms sends error
  labels = attr(terms(.xpd(rhs = gsub("\\^(?=[^0-9])", "%^%", fml_char, perl = TRUE))), 
                "term.labels")

  # now we work this out
  for(i in seq_along(labels)){
    lab = labels[i]
    if(grepl("^", lab, fixed = TRUE)){
      lab_split = trimws(strsplit(lab, "%^%", fixed = TRUE)[[1]])
      if(grepl("(", lab, fixed = TRUE)){
        # we add some error control -- imperfect, but... it's enough
        lab_collapsed = gsub("\\([^\\)]+\\)", "", lab)
        if(length(lab_split) != length(strsplit(lab_collapsed, "%^%", fixed = TRUE)[[1]])){
          msg = "Wrong formatting of the fixed-effects interactions. The `^` operator should not be within parentheses."
          stop(msg)
        }
      }
      labels[i] = paste0(fun2combine, "(", paste0(lab_split, collapse = ", "), ")")
    }
  }

  if(vars){
    return(labels)
  }

  fml = .xpd(rhs = labels)

  fml
}

# x = c('combine_clusters(bin(fe1, "!bin::2"), fe2)', 'fe3')
rename_hat = function(x){

  qui = grepl("combine_clusters", x, fixed = TRUE)
  if(!any(qui)) return(x)

  for(i in which(qui)){
    xi_new = gsub("combine_clusters(_fast)?", "sw", x[i])
    sw_only = extract_fun(xi_new, "sw")
    sw_eval = eval(str2lang(sw_only$fun))
    new_var = paste0(sw_only$before, paste0(sw_eval, collapse = "^"), sw_only$after)
    x[i] = new_var
  }

  x
}

prepare_cluster_mat = function(fml, data, fastCombine){
  # prepares the data.frame of the cluster variables

  fml_char = as.character(fml[2])
  changeNames = FALSE
  if(grepl("^", fml_char, fixed = TRUE)){
    # special indicator to combine factors
    fml = fml_combine(fml_char, fastCombine)
    changeNames = TRUE
  }

  t = terms(fml, data = data)

  all_var_names = attr(t, "term.labels")
  all_vars = gsub(":", "*", all_var_names)

  if(all(all_vars %in% names(data))){
    # we ensure the data is a pure DF 
    if(!identical(class(data), "data.frame")){
      class(data) = "data.frame"
    }
    res = data[, all_vars, drop = FALSE]
  } else {
    all_vars_call = str2lang(paste0("list(", paste0(all_vars, collapse = ", "), ")"))
    data_list = eval(all_vars_call, data)
    names(data_list) = all_var_names
    data_list$stringsAsFactors = FALSE

    res = do.call("data.frame", data_list)

    if(changeNames){
      all_var_names = rename_hat(all_var_names)
    }

    names(res) = all_var_names
  }

  res
}

combine_clusters_fast = function(...){
  # This functions creates a new cluster from several clusters
  # basically: paste(cluster1, cluster2, ... etc, sep = "__")
  # but it tries to be faster than that because paste is very slow on large datasets

  cluster = list(...)
  Q = length(cluster)

  # The problem is that the clusters can be of ANY type...
  # So too much of a pain to take care of them in c++
  # => when I have time I'll do it, but pfff...

  # Not super efficient, but that's life
  ANY_NA = FALSE
  if(any(who_NA <- sapply(cluster, anyNA))){
    ANY_NA = TRUE
    who_NA = which(who_NA)
    IS_NA = is.na(cluster[[who_NA[1]]])
    for(i in who_NA[-1]){
      IS_NA = IS_NA | is.na(cluster[[i]])
    }

    # Nice error message comes later
    if(all(IS_NA)) return(rep(NA, length(IS_NA)))

    # we recreate the clusters
    for(i in 1:Q){
      cluster[[i]] = cluster[[i]][!IS_NA]
    }
  }

  # First we unclass
  for(i in 1:Q){
    cluster[[i]] = quickUnclassFactor(cluster[[i]])
  }

  # Then we combine
  power = floor(1 + log10(sapply(cluster, max)))

  if(sum(power) > 14){
    order_index = do.call(order, cluster)
    index = cpp_combine_clusters(cluster, order_index)
  } else {
    # quicker, but limited by the precision of doubles
    index = cluster[[1]]
    for(q in 2:Q){
      index = index + cluster[[q]]*10**sum(power[1:(q-1)])
    }
  }

  if(ANY_NA){
    # we recreate the return vector with appropriate NAs
    res = rep(NA_real_, length(IS_NA))
    res[!IS_NA] = index
  } else {
    res = index
  }

  return(res)
}

combine_clusters = function(...){
  # This functions creates a new cluster from several clusters
  # basically: paste(cluster1, cluster2, ... etc, sep = "_")
  # No, it's mow much more fatser than that!!!! Thanks for my new algo
  # => the paste only applies to the unique number of items


  clusters = to_integer(..., add_items = TRUE, items.list = TRUE, internal = TRUE)

  res = clusters$items[clusters$x]

  return(res)
}

listDefault = function(x, variable, value){
  # This function puts 'value' into the element 'variable' of list 'x'
  # IF it does not already exists in 'x'

  x_name = deparse(substitute(x))

  if(is.null(x[[variable]])){
    x[[variable]] = value

    assign(x_name, x, envir = parent.frame(n = 1))
  }

}

hgrid = function(lty = 3, col = "darkgray", ymin = -Inf, ymax = Inf, ...){
  # simple function that draws an horizontal grid

  # Finding the coordinates
  y = axis(2, lwd=0, labels = NA)

  y = y[y > ymin & y < ymax]

  # now drawing the lines
  if(length(y) > 0){
    abline(h = y, col = col, lty = lty, ...)
  }
}

vgrid = function(lty = 3, col = "darkgray", xmin = -Inf, xmax = Inf, ...){
  # simple function that draws an horizontal grid

  # Finding the coordinates
  x = axis(1, lwd=0, labels = NA)

  x = x[x > xmin & x < xmax]

  # now drawing the lines
  if(length(x) > 0){
    abline(v = x, col = col, lty = lty, ...)
  }
}

shade_area = function(y1, y2, x, xmin, xmax, col="grey", ...){
  # fonction plus pratique que polygon
  # elle permet de griser une partie delimitee par
  # y1 et y2 pour chacune des valeurs de x
  # on doit avoir la meme longueur de y1,y2 et x
  # exemple:
  # a=curve(x**2,-5,5)
  # shade_area(a$y+1,a$y-1,a$x)
  # qqes parametres graphiques:
  # lwd / border (couleur du bord, peut etre NA) / lty

  n = length(x)
  stopifnot(length(y1)==n | length(y1)==1)
  stopifnot(length(y2)==n | length(y2)==1)

  if(length(y1)==1) y1 = rep(y1,n)
  if(length(y2)==1) y2 = rep(y2,n)

  if(missing(xmin)) xmin = min(x)
  if(missing(xmax)) xmax = max(x)

  ind = which(x>=xmin & x<=xmax)
  x1 = x[ind] ; x2 = x[rev(ind)]
  polygon(c(x1,x2), c(y1[ind], y2[rev(ind)]), col=col, ...)
}


check_set_nthreads = function(nthreads){
  # Simple function that checks that the nber of threads is valid
  set_up(1)

  check_value(nthreads, "integer scalar GE{0} | numeric scalar GT{0} LT{1}", .message = paste0("The argument `nthreads` must be an integer lower or equal to the number of threads available (", max(cpp_get_nb_threads(), 1), "). It can be equal to 0 which means all threads. Alternatively, if equal to a number strictly between 0 and 1, it represents the fraction of all threads to be used."))

  max_threads = cpp_get_nb_threads()

  # # To add later
  # if(cpp_is_in_fork()) return(1)

  if(nthreads == 0){
    nthreads = max(max_threads, 1)

  } else if(nthreads < 1){
    nthreads = max(ceiling(max_threads * nthreads), 1)

  } else if(nthreads > 1){
    if(max_threads == 0){
      warn_up("OpenMP not detected: cannot use ", nthreads, 
              " threads, single-threaded mode instead.")
      nthreads = 1
    } else if(nthreads > max_threads){
      warn_up("Asked for ", nthreads, " threads while the maximum is ", max_threads, 
              ". Set to ", max_threads, " threads instead.")
      nthreads = max_threads
    }

  }

  nthreads
}

clean_interact_names = function(x){
  # GOD, why is it so complicated? Why?
  # Why do I have to spend so much time on that crap?
  #
  # x = c("i(Month, keep = 4:7)__CLEAN__Month::5", "i(Month):Wind__CLEAN__Month::5", "Temp:i(bonjour(test), drop = 5:12):Wind__CLEAN__bonjour::5", "i(a, pi(b))__CLEAN__a::o")
  #
  # Speed consideration:
  # for 2000 names: 10ms simple case
  #                 28ms complex case
  # price to pay is OK
  #

  res = x

  who2clean = grepl("__CLEAN__", x, fixed = TRUE)

  x2clean = x[who2clean]

  x_split = strsplit(x2clean, "(^|(?<=[^[:alnum:]\\._]))(i|sunab(_att)?)\\(|__CLEAN__", perl = TRUE)

  x_left = sapply(x_split, function(v) v[1])
  x_right = sapply(x_split, function(v) v[2])
  x_alias = sapply(x_split, function(v) v[3])

  x_right = gsub("^[^\\(\\)]+(\\(|\\))", "\\1", x_right)

  qui_ok = substr(x_right, 1, 1) == ")"

  x_right[qui_ok] = substr(x_right[qui_ok], 2, 500)

  if(any(!qui_ok)){

    clean_x_right_hard = function(letter_vec){
      open = 1 + cumsum(letter_vec == "(")
      close = cumsum(letter_vec == ")")
      letter_vec = letter_vec[-(1:which.max(close - open == 0))]
      paste(letter_vec, collapse = "")
    }

    x_right[!qui_ok] = sapply(strsplit(x_right[!qui_ok], ""), clean_x_right_hard)
  }

  res[who2clean] = paste0(x_left, x_alias, x_right)

  return(res)
}

is_naked_fun = function(x, fun_pattern){
  # Why is it always so complicated... There must be an easier way
  # x = c("i(x1)", "i(I(x3))", "i(x3, x4, TRUE, drop = c(1, 3:5))", "Temp:i(x2)", "i(x3):Wind")

  x_split = strsplit(x, paste0("(^|(?<=[^[:alnum:]\\._]))", fun_pattern, "\\("), perl = TRUE)

  left = sapply(x_split, function(x) x[1])
  right = sapply(x_split, function(x) x[2])

  right_naked = function(r){

    if(!grepl("(", r, fixed = TRUE) && grepl("\\)$", r)){
      return(TRUE)
    }

    letter_vec = strsplit(r, "")[[1]]
    open = 1 + cumsum(letter_vec == "(")
    close = cumsum(letter_vec == ")")
    which.max(close - open == 0) == length(letter_vec)
  }

  left_ok  = nchar(left) == 0
  right_ok = rep(FALSE, length(right))
  if(any(left_ok)){
    right_ok[left_ok] = sapply(right[left_ok], right_naked)
  }

  left_ok & right_ok
}

set_defaults = function(opts_name){

  opts = getOption(opts_name)
  if(is.null(opts) || length(opts) == 0){
    return(NULL)
  }

  sysOrigin = sys.parent()
  mc = match.call(definition = sys.function(sysOrigin), 
                  call = sys.call(sysOrigin), expand.dots = FALSE)
  args_in = names(mc)

  args2set = setdiff(names(opts), args_in)

  for(v in args2set){
    assign(v, opts[[v]], parent.frame())
  }

}

fetch_data = function(x, prefix = "", suffix = ""){
  # x: fixest estimation
  # We try different strategies:
  # 0) the data has been saved at estimation time with `data.save = TRUE`
  # 1) using the environment where the estimation was done
  # 2) the "parent.frame()" defined as the frame on top of ALL fixest functions
  # 3) the global environment, if it wasn't in 1)

  # Maybe I should keep only 1) => is there a reason to add the others?

  # 0) and 1) safest
  # 2) less safe but OK => note ???
  # 3) kind of dangerous => warning() ???
  
  # 0) Data is within the fixest object
  
  if(!is.null(x$data)){
    return(x$data)
  }

  if(is.null(x$call$data)) return(NULL)

  # 1) Environment of the call

  data = NULL
  try(data <- eval(x$call$data, x$call_env), silent = TRUE)

  if(!is.null(data)){
    return(data)
  }

  # 2) First non fixest frame

  fixest_funs = ls(getNamespace("fixest"))

  i = 2
  sysOrigin = sys.parent(i)
  while(sysOrigin != 0 && as.character(sys.call(sysOrigin)[[1]]) %in% fixest_funs){
    i = i + 1
    sysOrigin = sys.parent(i)
  }

  if(sysOrigin != 0){
    # We try again...
    try(data <- eval(x$call$data, parent.frame(sysOrigin)), silent = TRUE)

    if(!is.null(data)){
      return(data)
    }
  }

  # 3) Global environment

  if(!identical(parent.env(x$call_env), .GlobalEnv)){
    # ...and again
    try(data <- eval(x$call$data, .GlobalEnv), silent = TRUE)

    if(!is.null(data)){
      return(data)
    }
  }

  # => Error message
  
  begin = sma(prefix, " we", .last = "ws, upper.sentence")

  if(nchar(suffix) > 0){
    suffix = gsub("^ +", "", suffix)
  }

  data_name = charShorten(deparse(x$call$data)[1], 15)
  stop_up(begin, " fetch the data in the enviroment where the estimation was made, but the data does not seem to be there any more (btw it was {bq ? data_name}). ", suffix)


}

is_large_object = function(x){

  if(is.list(x)){
    if(length(x[[1]]) > 10000){
      return(TRUE)
    } else {
      return(FALSE)
    }
  } else if(length(x)[1] > 10000){
    return(TRUE)
  }

  FALSE
}

build_flags = function(mc, ..., call_env){
  # Returns a list of arguments
  # If the arguments are too large => we use calls instead and save the calling environment
  # Note that the arguments that will be passed in ... must NOT be missing // they must be initialized to NULL
  # All that stuff just to avoid too large objects....

  dots = list(...)

  args = names(dots)

  res = list()
  for(i in seq_along(dots)){

    x = dots[[i]]
    if(is.null(x)){
      # nothing // NULL arguments are still in 'dots', so we want to avoid them
    } else if(is_large_object(x)){
      res[[args[i]]] = mc[[args[i]]]
      if(is.null(res$call_env)){
        if(missing(call_env)){
          res$call_env = new.env(parent = parent.frame(2))
        } else {
          res$call_env = call_env
        }
      }
    } else {
      res[[args[i]]] = x
    }
  }

  res
}

assign_flags = function(flags, ...){
  # flags == arguments
  # We get the arguments used when the function was originally called
  # we assign these arguments to the previous frame

  dots = list(...)

  res = list()

  for(arg in setdiff(names(flags), "call_env")){

    if(is.null(dots[[arg]])){
      # we only assign values that were not provided by the user
      x = flags[[arg]]
      if(is.name(x) || is.call(x)){
        x = eval(x, flags$call_env)
      }
      assign(arg, x, parent.frame())
    }
  }
}

items_to_drop = function(items, x, varname, keep = FALSE, argname,
                         keep_first = FALSE, up = 1, no_error = FALSE,
                         valid_ref = FALSE){
  # selection of items
  # the selection depends on the type of x
  # always returns the IDs of the items to drop

  set_up(up)

  if(missing(argname)){
    argname = deparse(substitute(x))
  }

  ref = argname == "ref" && !valid_ref

  if(keep_first && (keep || ref)){
    stop("Internal error: keep_first should not be used with `keep` or `ref`.")
  }

  if(is.factor(items)){
    items = as.character(items)
  }

  if(is.factor(x)){
    x = as.character(x)
  }


  if(inherits(x, "formula")){

    if(!identical(all.vars(x), "x")){
      stop_up("In argument `", argname, "`, the formula must contain a single variable name: `x`. So far `", deparse_long(x), "` is not valid.")
    }

    if(length(x) > 2){
      stop_up("In argument `", argname, "`, if a formula, it must be one-sided. Problem: `",
              deparse_long(x), "` is two-sided.")
    }

    is_here = error_sender(eval(x[[2]], list(x = items)),
                           "In argument `", argname, 
                           "`, the evaluation of the formula led to an error:")
    if(length(is_here) != length(items)){
      stop_up("In argument `", argname, "`, the evaluation of the formula must return a logical vector of the same length as `x`. Problem: `", deparse_long(x), "` returns a vector of length ", length(is_here), " (expected: ", length(items), ").")
    }

    if(!is.logical(is_here)){
      stop_up("In argument `", argname, "`, the evaluation of the formula must return a logical vector. Problem: `", deparse_long(x), "` is not logical (instead it is of class ", enumerate_items(class(is_here)), ").")
    }

    is_here = !is.na(is_here) & is_here

    if(!no_error && !any(is_here)){
      stop_up("In argument `", argname, "`, the evaluation of the formula must match at least one value. Problem: `", deparse_long(x), "` does not match any.")
    }

    if(keep){
      id_drop = which(!is_here)
    } else {
      id_drop = which(is_here)
    }

  } else if(is.character(x)){
    all_x = c()
    for(i in seq_along(x)){

      my_x = x[i]
      if(grepl("^@", my_x)){
        # A) regex
        pattern = substr(my_x, 2, nchar(my_x))
        new_x = grep(pattern, items, value = TRUE)
        if(length(new_x) == 0 && !no_error){
          # strong checking!
          stop_up("In argument `", argname, "`, the regular expression `", pattern, "` does not match any value of `", varname, "`.")
        }
        all_x = c(all_x, new_x)
      } else {
        # B) partial matching
        if(no_error){
          qui_ok = pmatch(tolower(my_x), tolower(items), duplicates.ok = TRUE)
          my_x = items[qui_ok]
          my_x = my_x[!is.na(my_x)]
        } else {
          check_set_value(my_x, "match", .choices = unique(items), .message = paste0("The argument `", argname, "` should contain values of the variable `", varname, "`."))
        }

        all_x = c(all_x, my_x)
      }

      if(keep_first && i == 1){
        all_x_first = all_x
      }

    }

    if(ref){
      if(length(all_x) == 1){
        id_drop = which(items %in% all_x)
      } else {
        id_drop = c(which(items %in% all_x[1]), which(items %in% all_x[-1]))
      }
    } else if(keep){
      id_drop = which(!items %in% all_x)
    } else {
      id_drop = which(items %in% all_x)

      if(keep_first){
        id_drop = unique(c(which(items %in% all_x_first), id_drop))
      }
    }

  } else {
    # exact matching
    if(keep){
      id_drop = which(!items %in% x)
    } else {

      if(ref){
        if(length(x) == 1){
          id_drop = which(items %in% x)
        } else {
          id_drop = c(which(items %in% x[1]), which(items %in% x[-1]))
        }
      } else {
        id_drop = which(items %in% x)

        if(keep_first){

          if(!x[1] %in% items && !no_error){
            stop_up("In argument `", argname, "`, the value ", x[1], " does not match any value of `", varname, "`.")
          }

          id_drop = unique(c(which(items %in% x[1]), id_drop))
        }

      }

      if(length(id_drop) == 0 && !no_error){
        stop_up("In argument `", argname, "`, the value", plural_len(x, "s.don't"), " match any value of `", varname, "`.")
      }
    }

  }

  id_drop
}



bin_factor = function(bin, x, varname, no_error = FALSE){
  # x = base_did$period
  # bin = list("9" = c(9, 2, 3), "7" = "@7|8", 5:6)
  # varname = "period" ; argname = "bin"

  set_up(1)

  argname = deparse(substitute(bin))
  check_arg(bin, "list | vector | os formula")

  #
  # DSB expansion
  #

  bin_dp = deparse_long(bin)
  if(grepl(".[", bin_dp, fixed = TRUE)){

    bin_names = names(bin)
    if(!is.null(bin_names)){
      qui = which(grepl(".[", bin_names, fixed = TRUE))
      for(i in qui){
        bin_names[i] = error_sender(dsb(bin_names[i], frame = parent.frame(2), 
                                        nest = FALSE, vectorize = TRUE, collapse = ""),
                                    sma("Error when binning: the name ({bin_names[i]}) expanded",
                                        " with `.[]` led to an error:"))

      }
    }
    names(bin) = bin_names

    qui = which(sapply(bin, function(x) is.character(x) && any(grepl(".[", x, fixed = TRUE))))
    for(i in qui){
      value = as.list(bin[[i]])
      qui_bis = which(grepl(".[", value, fixed = TRUE))
      for(j in qui_bis){
        value[[j]] = error_sender(.dsb(value[[j]], frame = parent.frame(2), nest = FALSE),
                                  sma("Error when binning: the name ({ value[[j]] }) expanded",
                                      " with `.[]` led to an error:"))
      }
      bin[[i]] = unlist(value)
    }

  }

  #
  # cut:: => special treatment, we short circuit
  #

  if(!is.list(bin) && (is.character(bin) && any(grepl("^cut::", bin)))){

    if(!grepl("^cut::", bin[1])){
      stop_up("To use the special binning `cut::values`, it must be in the first position of the vector, possibly followed by custom names. Currently `cut::values` is in position ", which(grepl("^cut::", bin))[1], ".")
    }

    n_names = length(bin) - 1
    if(n_names > 1){
      my_cut = trimws(sub("^cut::", "", bin[1]))
      if(grepl("^[[:digit:]]+$", my_cut)){
        n_cuts = as.numeric(my_cut)
      } else {
        n_cuts = length(strsplit(my_cut, "\\[|\\]")[[1]]) + 1
      }

      if(n_names > n_cuts){
        stop_up("In the special binning `cut::values`, the number of custom names is greater than the number of bins (", n_names, " vs ", n_cuts, ").")
      }
    }

    if(!is.numeric(x)){
      stop_up("To use the special binning `cut::values`, the variable {bq?varname} must be numeric. Currently this is not the case (it is of class {enum.q ? class(x)} instead).")
    }

    return(cut_vector(x, bin))
  }

  x_int = to_integer(x, add_items = TRUE, items.list = TRUE, sorted = TRUE)
  x_items = x_int$items

  do_factor = FALSE
  if(is.factor(x_items)){
    do_factor = TRUE
    x_items = as.character(x_items)
  }

  if(!is.list(bin)){
    if(is.character(bin) && any(grepl("^!?!?bin::", bin))){

      if(length(bin) > 1){
        stop_up("To use the special binning `bin::digit`, the argument {bq?argname} must be of length 1. Currently it is of length {len?bin}.")
      }

      d = gsub("^!?!?bin::", "", bin)
      if(any(grepl("[^[:digit:]]", d))){
        bin_type = gsub("^(!?!?bin).*", "\\1", bin)
        stop_up("In the argument bin, the special binning must be of the form `{bin_type}::digit`. Currently this is not the case for `{bin_type}::{d}.")
      }
      d = as.numeric(d)

      consecutive = grepl("^!", bin)
      from_last = grepl("^!!", bin)

      if(!consecutive){
        if(!is.numeric(x_items)){
          stop_up("To use the special binning `bin::digit`, the variable {bq?varname} must be numeric. Currently this is not the case (it is of class {enum.q?class(x_items)} instead).")
        }

        new_x = (x_items %/% d) * d
        new_x_unik = unique(new_x)

      } else {
        n = length(x_items)
        x_seq = if(from_last) n:1 else 1:n
        new_x = ((x_seq - 1) %/% d) * d + 1
        new_x_unik = unique(new_x)
      }

      bin = list()
      for(i in seq_along(new_x_unik)){
        bin[[i]] = x_items[new_x == new_x_unik[i]]
      }

    } else {
      bin = list(bin)
    }
  }

  # we catch "@" as first item
  if(identical(bin[[1]], "@") && length(bin) > 1){
    # this means that the user wants to specify
    # the binned values as first elements of the new factor
    bin[[1]] = NULL
    if(is.null(names(bin))){
      names(bin) = paste0("@", seq_along(bin), " ", names(bin))
    } else {
      qui_ok = !grepl("^@\\d+", names(bin))
      names(bin)[qui_ok] = paste0("@", seq(sum(qui_ok)), " ", names(bin)[qui_ok])
    }
  }

  x_map = x_range = seq_along(x_int$items)
  id_bin = list()
  for(i in seq_along(bin)){
    id_bin[[i]] = items_to_drop(x_items, bin[[i]], varname, up = 2, argname = argname,
                  keep_first = TRUE, no_error = no_error, valid_ref = TRUE)
  }

  # sanity check
  if(any(table(unlist(id_bin)) > 1)){
    t_id = table(unlist(id_bin))
    n_max = max(t_id)
    pblm = x_items[as.numeric(names(t_id)[t_id == n_max])]
    stop_up("In `bin`, some values are binned in different bins, it's of course not allowed. The value `", pblm, "` is in ", n_max, " bins.")
  }

  # recreating the factor

  # case no_error = TRUE
  if(any(lengths(id_bin) == 0)){
    qui_ok = lengths(id_bin) > 0
    id_bin = id_bin[qui_ok]
    if(length(id_bin) == 0) return(x)
    bin = bin[qui_ok]
  }

  id_not_core = FALSE
  for(i in seq_along(bin)){
    id_bin_i = id_bin[[i]]
    id_core = id_bin_i[1]

    id_change = x_range %in% setdiff(id_bin_i, id_core)
    id_not_core = id_not_core | id_change

    x_map = x_map - cumsum(id_change)
  }

  # final pass
  for(i in seq_along(bin)){
    id_bin_i = id_bin[[i]]
    id_core = id_bin_i[1]
    id_change = x_range %in% id_bin_i

    x_map[id_change] = x_map[id_core]
  }

  # changing the item values
  x_items_new = x_items

  bin_names = names(bin)
  custom_loc = FALSE # custom location
  if(is.null(bin_names)){
    bin_names = character(length(bin))

  } else if(any(grepl("^@\\d", bin_names))){
    custom_loc = TRUE
    do_factor = TRUE
    x_items_new = as.character(x_items_new)

    qui = grepl("^@\\d", bin_names)
    bin_location = sub("^@(\\d+).*", "\\1", bin_names)
    bin_location[!qui] = NA
    bin_location = as.numeric(bin_location)

    bin_names = sub("^@\\d+ *", "", bin_names)
  }

  for(i in seq_along(bin)){

    b_name = bin_names[[i]]
    if(nchar(b_name) == 0){
      b_name = as.character(x_items_new[id_bin[[i]][1]])
      bin_names[[i]] = b_name # useful for new_loc
    }

    if(is.numeric(x_items_new)){
      b_name_num = tryCatch(as.numeric(b_name), warning = function(x) "not numeric")
      if(identical(b_name_num, "not numeric")){
        # we convert to character
        do_factor = TRUE
        x_items_new = as.character(x_items_new)
        b_name_num = b_name
      }

      x_items_new[id_bin[[i]]] = b_name_num
    } else {
      x_items_new[id_bin[[i]]] = b_name
    }
  }

  x_items_new = x_items_new[!id_not_core]

  if(custom_loc){
    n_bins = length(bin_names)
    new_loc = rep(NA_real_, length(x_items_new))
    for(i in 1:n_bins){
      b_name = bin_names[i]
      new_loc[x_items_new == b_name] = bin_location[i]
    }

    loc_no_na = new_loc[!is.na(new_loc)]
    while(anyDuplicated(loc_no_na)){
      # regularization
      is_dup = duplicated(loc_no_na)
      value = min(loc_no_na[is_dup])
      i_first = which.max(loc_no_na == value)
      loc_no_na[loc_no_na >= value] = loc_no_na[loc_no_na >= value] + 1
      loc_no_na[i_first] = value
    }

    n_max = length(x_items_new)
    if(any(loc_no_na > n_max)){
      # regularization

      n_lnona = length(loc_no_na)
      if(n_lnona == 1){
        loc_no_na = n_max
      } else {
        order_loc_no_na = order(loc_no_na)

        loc_sorted = loc_no_na[order_loc_no_na]
        loc_sorted[n_lnona] = n_max
        for(i in n_lnona:2) {
          if(loc_sorted[i] <= loc_sorted[i - 1]){
            loc_sorted[i - 1] = loc_sorted[i] - 1
          }
        }

        loc_no_na = loc_sorted[order(order_loc_no_na)]
      }
    }

    new_loc[!is.na(new_loc)] = loc_no_na

    for(i in seq_along(new_loc)){
      if(!i %in% new_loc){
        j = which(is.na(new_loc))[1]
        new_loc[j] = i
      }
    }

    x_map = new_loc[x_map]
    x_items_new = x_items_new[order(new_loc)]
  }

  if(do_factor){
    x_items_new = factor(x_items_new, levels = x_items_new)
  }

  x_new = x_items_new[x_map[x_int$x]]

  return(x_new)
}


cut_vector = function(x, bin){
  # We cut a vector into pieces
  # **only numeric vectors**
  # cut::a]b[c] cuts the vectors into four slices: [-Inf, a]; ]a, b[; [b, c]; ]c, Inf]
  # a, b, c should be numeric values
  # they can be replaced with: pXX or qX, percentiles and quartiles
  # cut::n splits the data into n equal sizes

  set_up(2)

  bin_names_usr = bin[-1]
  bin = bin[1]

  # checking bin
  if(!is.character(bin) || !grepl("^cut::", bin)){
    stop("Internal bug: Argument `bin` should be equal to `cut::stg` over here -- this is not the case.")
  }

  # cleaning x
  n = length(x)
  ANY_NA = anyNA(x)
  IS_NA = FALSE
  if(ANY_NA){
    IS_NA = is.na(x)

    if(all(IS_NA)){
      return(x)
    }

    x = x[!IS_NA]
  }

  # we sort x (needed later)
  x_order = order(x)
  x_sorted = x[x_order]

  my_cut = gsub("^cut::", "", bin)

  if(is_numeric_in_char(my_cut)){
    my_cut = as.numeric(my_cut)
    pct = cumsum(rep(1/my_cut, max(my_cut - 1, 1)))
    cut_points = quantile(x_sorted, pct)

    bounds = rep("]", length(cut_points))

  } else {
    bounds = regmatches(my_cut, gregexpr("\\[|\\]", my_cut))[[1]]
    values = strsplit(my_cut, "\\[|\\]")[[1]]

    n_cuts = length(values)

    if(n_cuts != length(bounds)){
      stop_up("In `bin`, the format should be `cut::a]b]` with `a`, `b`, etc, numbers or quartiles/percentiles. Problem: each number must be followed by an open (or closed) square bracket, this is currentlly not the case.")
    }

    cut_points = numeric(n_cuts)
    for(i in seq_along(values)){

      v = trimws(values[i])

      if(is_numeric_in_char(v)){
        cut_points[i] = as.numeric(v)

      } else if(grepl("^p|P|q|Q", v)){
        p = gsub("^.", "", v)
        if(!is_numeric_in_char(p)){
          stop_up("In `bin`, the format should be `cut::a]b]` with `a`, `b`, etc, numbers or quartiles (resp. percentiles) of the form qX (resp. pX) with X a number. \n  The value `", v, "`, in `", bin, "`, is incorrect.")
        }

        p = as.numeric(p)


        if(grepl("^q|Q", v)){
          # we transform the quartile into a percentile
          if(!p %in% c(0:4)){
            stop_up("In `bin`, the format should be `cut::a]b]` with `a`, `b`, etc, numbers or quartiles (resp. percentiles) of the form qX (resp. pX) with X a number. \n  The value `", v, "`, in `", bin, "`, is incorrect. \n  The quartile must be an integer between 0 and 4.")
          }

          p = c(0, 25, 50, 75, 100)[p + 1]
        }

        if(!p %in% 0:100){
          stop_up("In `bin`, the format should be `cut::a]b]` with `a`, `b`, etc, numbers or quartiles (resp. percentiles) of the form qX (resp. pX) with X a number. \n  The value `", v, "`, in `", bin, "`, is incorrect. \n  The percentile must be an integer between 0 and 100.")
        }

        cut_points[i] = quantile(x, p/100)

      } else {
        stop_up("In `bin`, the format should be `cut::a]b]` with `a`, `b`, etc, numbers or quartiles (resp. percentiles) of the form qX (resp. pX) with X a number. \n  The value `", v, "`, in `", bin, "`, is incorrect. This is not a percentile nor a number.")
      }
    }

    if(n_cuts > 1 && any(cut_points[1:(n_cuts-1)] > cut_points[1 + 1:(n_cuts-1)])){
      i_pblm = which(cut_points[1:(n_cuts-1)] > cut_points[1 + 1:(n_cuts-1)])[1]
      stop_up("In `bin`, the format should be `cut::a]b]` with `a`, `b`, etc, numbers or quartiles (resp. percentiles) of the form qX (resp. pX) with X a number. \n  The values `a`, `b`, etc should be increasing, but the ", n_th(i_pblm), " value (", cut_points[i_pblm], ") is larger than the ", n_th(i_pblm + 1), " (", cut_points[i_pblm + 1], ").")
    }

  }

  bin_names = character(length(cut_points) + 1)
  for(i in seq_along(bin_names_usr)){
    bi = bin_names_usr[i]
    if(!is.na(bi)){
      bin_names[i] = bi
    }
  }

  is_included = 1 * (bounds == "]")
  x_cut = cpp_cut(x_sorted, cut_points, is_included)

  x_int = x_cut$x_int
  isnt_empty = x_cut$isnt_empty
  value_min = x_cut$value_min
  value_max = x_cut$value_max
  is_int = x_cut$is_int

  n_bins = sum(isnt_empty)

  if(any(isnt_empty == 0)){
    i_empty = which(isnt_empty == 0)
    if(getFixest_notes()){
      message("When binning: in `", bin, "`, the ", enumerate_items(n_th(i_empty)), " bin", plural_len(i_empty, "s.is"), " empty.")
    }

    x_int = cumsum(isnt_empty)[x_int]
    value_min = value_min[-i_empty]
    value_max = value_max[-i_empty]
    bin_names = bin_names[-i_empty]
  }

  # creating the labels
  labels = character(n_bins)
  if(is_int){
    # format A-B

    # we don't want to add a comma to the years! But to large numbers: yes
    fmt_fun = if(value_max[n_bins] > 9999) dreamerr::fsignif else function(x) as.character(x)

    for(i in 1:n_bins){
      if(value_min[i] == value_max[i]){
        labels[i] = fmt_fun(value_min[i])
      } else {
        labels[i] = paste0(fmt_fun(value_min[i]), "-", fmt_fun(value_max[i]))
      }
    }

  } else {
    # format [A, B]
    # we always write in inclusion, too hard to do the exclusion mentally
    d_min = NULL
    for(i in 1:n_bins){
      vmin = value_min[i]
      vmax = value_max[i]

      l10_diff = if(vmax == vmin) 0 else log10(vmax - vmin)
      # d: nber of digits
      d = if(l10_diff >= 0) 1 else ceiling(abs(l10_diff))
      if(!is.null(d_min) && d < d_min) d = d_min

      # we check consistency problems
      d_min = NULL
      vmax_fmt = sprintf("%.*f", d, vmax)
      if(i != n_bins){
        vmin_next_fmt = sprintf("%.*f", d, value_min[i + 1])
        if(vmax_fmt == vmin_next_fmt){
          l10_diff = log10(value_min[i + 1] - vmax)
          d = ceiling(abs(l10_diff))
          d_min = d
        }
        vmax_fmt = cpp_add_commas(vmax, d, FALSE)
      }
      vmin_fmt = cpp_add_commas(vmin, d, FALSE)

      labels[i] = paste0("[", vmin_fmt, "; ", vmax_fmt, "]")
    }
  }

  for(i in seq_along(bin_names)){
    bi = bin_names[i]
    if(nchar(bi) > 0){
      labels[i] = bi
    }
  }


  # Te result
  if(ANY_NA){
    res = rep(NA_real_, n)
    res[!IS_NA] = x_int[order(x_order)]
  } else {
    res = x_int[order(x_order)]
  }

  res = factor(res, labels = labels)

  return(res)
}


fixest_pvalue = function(x, zvalue, vcov){
  # compute the pvalue for a fixest estimation

  if(use_t_distr(x)){

    if(missing(vcov)) {
      stop("Internal error (=bug): the argument `vcov` should not be missing in fixest_pvalue().")

    } else if(is.null(attr(vcov, "dof.K"))){
      stop("Internal error (=bug): the attribute `dof.K` from `vcov` should not be NULL.")
    }

    # df.t is always an attribute of the vcov
    df.t = attr(vcov, "df.t")
    if(is.null(df.t)){
      df.t = max(nobs(x) - attr(vcov, "dof.K"), 1)
    }

    pvalue = 2*pt(-abs(zvalue), df.t)

  } else {
    pvalue = 2*pnorm(-abs(zvalue))
  }

  pvalue
}

fixest_CI_factor = function(x, level, vcov = NULL, df.t = NULL){

  val = (1 - level) / 2
  val = c(val, 1 - val)

  if(use_t_distr(x)){

    if(missing(vcov) && missing(df.t)){
      stop("Internal error (=bug): the arguments `vcov` and `df.t` ",
         "should not be both missing in fixest_CI_factor().")

    }

    if(!missing(df.t)){
      if(!is.numeric(df.t) && !length(df.t) == 1){
        stop("Internal error (=bug): the arguments `df.t` ",
           "should be numeric in fixest_CI_factor().")
      }
    } else {
      if(is.null(attr(vcov, "dof.K"))){
        stop("Internal error (=bug): the attribute `dof.K` from `vcov` should not be NULL.")
      }

      # df.t is always an attribute of the vcov
      df.t = attr(vcov, "df.t")
      if(is.null(df.t)){
        df.t = max(nobs(x) - attr(vcov,"dof.K"), 1)
      }
    }

    fact = qt(val, df.t)

  } else {
    fact = qnorm(val)
  }

  fact
}


#### --------------- ####
#### Small Utilities ####
####


#
# stringmagic ####
#

sma = string_magic_alias(.check = FALSE)

catma = cat_magic_alias(.check = FALSE)

mema = message_magic_alias(.check = FALSE, .last = "'min(100, .sw)'width")

stvec = stringmagic::string_vec_alias()


escape_regex = function(x){
  # escape special characters in regular expressions => to make it as "fixed"

  res = gsub("((?<=[^\\\\])|(?<=^))(\\$|\\.|\\+|\\(|\\)|\\[|\\]|\\?|\\^)", "\\\\\\2", x, 
             perl = TRUE)
  res
}

.addCommas = function(x){

  if(!is.finite(x)) return(as.character(x))

  cpp_add_commas(x)
}

addCommas = function(x){
  sapply(x, .addCommas)
}

decimalFormat = function(x){

  who_valid = which(!is.na(x) & is.numeric(x))
  if(length(who_valid) == 0) return(x)

  res = x

  x_valid = x[who_valid]
  xPower = log10(abs(x_valid))

  if(min(xPower) > 0){
    pow_round = max(1, 6 - ceiling(xPower))
  } else if(min(xPower) < -5){
    pow_round = ceiling(abs(min(xPower))) + 2
  } else {
    pow_round = 6
  }

  res[who_valid] = round(x_valid, pow_round)

  res
}

numberFormat_single = function(x, type = "normal"){
  # For numbers higher than 1e9 => we apply a specific formatting
  # idem for numbers lower than 1e-4

  if(is.character(x)) return(x)

  if(is.na(x)){
    return(NA)
  }

  if(x == 0) return("0")

  exponent = floor(log10(abs(x)))

  if(-4 < exponent && exponent < 9){
    if(exponent > 0){
      return(addCommas(x))
    } else {
      return(as.character(decimalFormat(x)))
    }

  }

  left_value = round(x*10**-exponent, 3)

  if(type == "latex"){
    res = paste0("$", left_value, "\\times 10^{", exponent, "}$")
  } else {
    res = paste0(left_value, "e", ifelse(exponent > 0, "+", ""), exponent)
  }

  res
}

numberFormatLatex = function(x){
  sapply(x, numberFormat_single, type = "latex")
}

numberFormatNormal = function(x){
  sapply(x, numberFormat_single)
}

mysignif = function (x, d = 2, r = 1){

  .mysignif = function(x, d, r) {
    if (is.na(x)) {
      return(NA)
    }

    if(abs(x) >= 10^(d - 1)){
      return(round(x, r))
    } else {
      return(signif(x, d))
    }
  }
  sapply(x, .mysignif, d = d, r = r)
}

format_nber_single = function(x, digits, round = FALSE, pow_above = 10, pow_below = -5, 
                              tex = FALSE){
  # Attention aux nombres ronds => pas chiffre apres la virgule!

  if(is.na(x) || !is.numeric(x)){
    return(x)
  }

  if(round){
    # super simple
    return(cpp_add_commas(x, digits))

  } else {
    whole = (x %% 1) == 0
    if(abs(x) >= 10^(digits - 1)){
      x = round(x, 1)
    } else {
      x = signif(x, digits)
    }
  }

  exponent = if(x == 0) -16 else floor(log10(abs(x)))

  if(exponent >= pow_above || exponent <= pow_below){
    left_value = round(x*10**-exponent, min(digits, 2))

    if(tex){
      res = paste0("$", left_value, "\\times 10^{", exponent, "}$")
    } else {
      res = paste0(left_value, "e", ifelse(exponent > 0, "+", ""), exponent)
    }

  } else if(exponent >= 0){
    r = max(-(exponent + 1 - digits), 1)
    res = cpp_add_commas(x, r, whole)

  } else if(abs(x) > 10**(-digits)){
    res = sprintf("%.*f", digits, x)

  } else {
    res = sprintf("%.*f", abs(exponent), x)
  }

  res
}

format_number = function(x, digits = 4, round = FALSE, pow_above = 10, 
                         pow_below = -5, tex = FALSE){
  sapply(x, format_nber_single, digits = digits, round = round, pow_above = pow_above, 
         pow_below = pow_below, tex = tex)
}

index_2D_to_1D = function(i, j, n_j) 1 + (i - 1) * n_j + j - 1

par_fit = function(my_par, id){
  # simple function that extends the plot parameters
  my_par = rep(my_par, ceiling(max(id) / length(my_par)))
  my_par[id]
}

dict_apply = function(x, dict = NULL){

  check_arg(dict, "NULL named character vector no na", .message = "The argument `dict` must be a dictionnary, ie a named vector (eg dict=c(\"old_name\"=\"new_name\")")

  if(missing(dict) || length(dict) == 0){
    return(x)
  }

  # We make the dictionary names space agnostic, adds a handful of us only
  if(any(grepl(" ", x, fixed = TRUE))){
    x_tmp = gsub(" ", "", x, fixed = TRUE)
  } else {
    x_tmp = x
  }

  if(any(grepl(" ", names(dict), fixed = TRUE))){
    names(dict) = gsub(" ", "", names(dict), fixed = TRUE)
    if(anyDuplicated(names(dict))){
      dup = duplicated(names(dict))
      stop("The dictionary `dict` contains several items with the same names, it concerns ",
         enumerate_items(names(dict)[dup]), " (note that spaces are ignored).")
    }
  }

  who = x_tmp %in% names(dict)
  x[who] = dict[as.character(x_tmp[who])]
  x
}

keep_apply = function(x, keep = NULL, logical = FALSE){

  if(missing(keep) || length(keep) == 0){
    if(logical){
      return(rep(TRUE, length(x)))
    } else {
      return(x)
    }
  }

  check_arg(keep, "character vector no na", 
            .message = "The arg. `keep` must be a vector of regular expressions (see help(regex)).")

  res = x

  qui_keep = rep(FALSE, length(x))
  for(var2keep in keep){
    if(grepl("^!%", var2keep)) var2keep = gsub("^!%", "%!", var2keep)

    vect2check = res
    if(grepl("^%", var2keep)){
      var2keep = gsub("^%", "", var2keep)
      vect2check = names(res)
      if(is.null(vect2check)){
        # warning("In keep, the special character `%` cannot be used here.")
        vect2check = res
      }
    }

    if(grepl("^!", var2keep)){
      qui_keep = qui_keep | !grepl(substr(var2keep, 2, nchar(var2keep)), vect2check)
    } else {
      qui_keep = qui_keep | grepl(var2keep, vect2check)
    }
  }

  if(logical) return(qui_keep)

  res[qui_keep]
}

drop_apply = function(x, drop = NULL){

  if(missing(drop) || length(drop) == 0){
    return(x)
  }

  check_arg(drop, "character vector no na", 
            .message = "The arg. `drop` must be a vector of regular expressions (see help(regex)). ")

  res = x

  for(var2drop in drop){
    if(grepl("^!%", var2drop)) var2drop = gsub("^!%", "%!", var2drop)

    vect2check = res
    if(grepl("^%", var2drop)){
      var2drop = gsub("^%", "", var2drop)
      vect2check = names(res)
      if(is.null(vect2check)){
        # warning("In drop, the special character `%` cannot be used here.")
        vect2check = res
      }
    }

    if(grepl("^!", var2drop)){
      res = res[grepl(substr(var2drop, 2, nchar(var2drop)), vect2check)]
    } else {
      res = res[!grepl(var2drop, vect2check)]
    }
  }

  res
}

order_apply = function(x, order = NULL){

  if(missing(order) || length(order) == 0){
    return(x)
  }

  check_arg(order, "character vector no na", 
            .message = "The arg. `order` must be a vector of regular expressions (see help(regex)). ")

  res = x

  for(var2order in rev(order)){
    if(grepl("^!%", var2order)) var2order = gsub("^!%", "%!", var2order)

    vect2check = res
    if(grepl("^%", var2order)){
      var2order = gsub("^%", "", var2order)
      vect2check = names(res)
      if(is.null(vect2check)){
        # warning("In order, the special character `%` cannot be used here.")
        vect2check = res
      }
    }

    if(grepl("^!", var2order)){
      who = !grepl(substr(var2order, 2, nchar(var2order)), vect2check)
      res = c(res[who], res[!who])
    } else {
      who = grepl(var2order, vect2check)
      res = c(res[who], res[!who])
    }
  }

  res
}

charShorten = function(x, width, keep.digits = FALSE){
  # transforms characters => they can't go beyond a certain width
  # two dots are added to suggest longer character
  # charShorten("bonjour", 5) => "bon.."
  n = nchar(x)

  if(n > width && width > 3){
    if(keep.digits){
      trailing_digits = sma("{'\\d+$'X ? x}")
      n_d = nchar(trailing_digits)
      if(length(n_d) == 1 && n_d > 0){
        res = sma("{`max(width - n_d - 2, 3)`k ? x}..{trailing_digits}")
        return(res)
      }
    }

    res = substr(x, 1, width - 2)
    res = paste0(res, "..")
  } else {
    res = x
  }

  res
}


show_vars_limited_width = function(charVect, nbChars = 60, addS = FALSE){
  # There are different cases

  n = length(charVect)

  if(n == 1){
    return(charVect)
  }

  text_s = ifelse(addS, "s ", "")

  nb_char_vect = nchar(charVect)
  sumChar = cumsum(nb_char_vect) + (0:(n-1))*2 + 3 + 1

  if(max(sumChar) < nbChars){
    text = paste0(text_s, paste0(charVect[-n], collapse = ", "), " and ", charVect[n])
    return(text)
  }

  qui = max(which.max(sumChar > nbChars - 8) - 1, 1)

  nb_left = n - qui

  text = paste0(text_s, paste0(charVect[1:qui], collapse = ", "), 
                " and ", nb_left, " other", ifelse(nb_left>1, "s", ""), ".")

  return(text)
}


char2num = function(x, addItem = FALSE){
  # we transform the data to numeric => faster analysis

  # special case
  qui = which(x == "")
  if(length(qui) > 0){
    x[qui] = "xxEMPTYxx"
  }

  x_unik = unique(x)
  dict = 1:length(x_unik)
  names(dict) = x_unik
  x_num = dict[x]

  names(x_num) = NULL

  if(addItem){
    res = list(x = x_num, items = x_unik)
    return(res)
  } else {
    return(x_num)
  }

}

quickUnclassFactor = function(x, addItem = FALSE, sorted = FALSE){
  # does as unclass(as.factor(x))
  # but waaaaay quicker

  not_num = !is.numeric(x)
  is_char_convert = not_num && !is.character(x)

  if(is_char_convert){
    res = cpp_quf_gnl(as.character(x))
  } else {
    res = cpp_quf_gnl(x)
  }

  if(sorted){

    if(not_num){
      items = x[res$x_unik]
    } else {
      items = res$x_unik
    }

    x = res$x_uf

    new_order = order(items)
    order_new_order = order(new_order)
    x_uf = order_new_order[x]

    if(addItem){
      if(is_char_convert){
        res = list(x = x_uf, items = as.character(items[new_order]))
      } else {
        res = list(x = x_uf, items = items[new_order])
      }

      return(res)
    } else {
      return(x_uf)
    }
  }

  if(addItem){

    if(not_num){
      if(is_char_convert){
        items = as.character(x[res$x_unik])
      } else {
        items = x[res$x_unik]
      }

      res = list(x = res$x_uf, items = items)
    } else {
      names(res) = c("x", "items")
    }

    return(res)
  } else {
    return(res$x_uf)
  }
}

missnull = function(x) missing(x) || is.null(x)

MISSNULL = function(x){
  # same a missnull but we also check evaluation

  if(missing(x)) return(TRUE)

  # we check evaluation and nullity
  error_sender(x, up = 1, arg_name = deparse(substitute(x)))

  is.null(x)
}

isScalar = function(x, int = FALSE) {
  if(length(x) == 1L && is.numeric(x) && is.finite(x)){
    if(int){
      return(x %% 1 == 0)
    } else {
      return(TRUE)
    }
  } else {
    return(FALSE)
  }
}

isLogical = function(x) length(x) == 1L && is.logical(x) && !is.na(x)

isSingleChar = function(x) length(x) == 1L && is.character(x) && !is.na(x)

fml2varnames = function(fml, combine_fun = FALSE){
  # This function transforms a one sided formula into a
  # character vector for each variable

  # In theory, I could just use terms.formula to extract the variable.
  # But I can't!!!! Because of this damn ^ argument.
  # I need to apply a trick

  # combine_fun: whether to add a call to combine_clusters_fast

  # Only the ^ users "pay the price"

  if("^" %in% all.vars(fml, functions = TRUE)){
    # new algo using fml_combine, more robust
    fml_char = as.character(fml)[2]
    all_var_names = fml_combine(fml_char, TRUE, vars = TRUE)
    if(!combine_fun){
      all_var_names = rename_hat(all_var_names)
    }

  } else {
    t = terms(fml)
    all_var_names = attr(t, "term.labels")
    # => maybe I should be more cautious below???? Sometimes `:` really means stuff like 1:5
    all_var_names = gsub(":", "*", all_var_names) # for very special cases
  }


  all_var_names
}

msg_na_inf = function(any_na, any_inf){
  if(any_na && any_inf){
    res = "NA and infinite values"
  } else if(any_na){
    res = "NA values"
  } else {
    res = "infinite values"
  }

  res
}


which_na_inf = function(x, nthreads){
  # returns the same elements as cppar_which_na_inf_mat

  is_num = sapply(x, is.numeric)

  if(all(is_num)){
    res = cpp_which_na_inf_mat(as.matrix(x), nthreads)
  } else {

    # numeric
    if(any(is_num)){
      res = cpp_which_na_inf_mat(as.matrix(x[, is_num, drop = FALSE]), nthreads)
    } else {
      res = list(any_na_inf = FALSE, any_na = FALSE, any_inf = FALSE, is_na_inf = FALSE)
    }

    # non numeric
    is_na = rowSums(is.na(x[, !is_num, drop = FALSE])) > 0

    res$any_na = res$any_na || any(is_na)
    res$is_na_inf = res$is_na_inf || res$any_na
    res$is_na_inf = res$is_na_inf | is_na
  }

  return(res)
}

extract_fe_slope = function(t){
  # input: x$fixef_terms which contains the slopes and the fixef

  fixef_vars = t[!grepl("\\[", t)]
  slope_terms = t[grepl("\\[", t)]
  slope_vars = gsub(".+\\[|\\]", "", slope_terms)
  slope_fe = gsub("\\[.+", "", slope_terms)
  fe_all = gsub("\\[.+", "", t)

  list(fixef_vars=fixef_vars, slope_vars=slope_vars, slope_fe=slope_fe, 
       slope_terms=slope_terms, fe_all=fe_all)
}


deparse_short = function(x){
  x_dp = deparse(x)
  if(length(x_dp) > 1){
    x_dp = paste0(x_dp, "...")
  }

  x_dp
}

deparse_long = function(x){
  dep_x = deparse(x, width.cutoff = 500)
  if(length(dep_x) == 1){
    return(dep_x)
  } else {
    return(paste(gsub("^ +", "", dep_x), collapse = ""))
  }
}

isVector = function(x){
  # it seems that when you subselect in data.table
  # sometimes it does not yield a vector
  # so i cannot use is.vector to check the consistency

  if(is.vector(x)){
    return(TRUE)
  } else {
    if(is.null(dim(x)) && !is.list(x)){
      return(TRUE)
    }
  }
  return(FALSE)
}


fill_with_na = function(x, object){
  if(is.null(object$obs_selection)){
    return(x)
  }

  res = rep(NA, object$nobs_origin)
  qui = 1:object$nobs_origin

  for(i in seq_along(object$obs_selection)){
    qui = qui[object$obs_selection[[i]]]
  }

  res[qui] = x

  return(res)
}

trim_obs_removed = function(x, object){
  if(is.null(object$obs_selection)){
    return(x)
  }

  for(i in seq_along(object$obs_selection)){
    x = x[object$obs_selection[[i]]]
  }

  x
}

is_operator = function(x, op) if(length(x) <= 1) FALSE else x[[1]] == op

fml_breaker = function(fml, op){
  res = list()
  k = 1
  while(is_operator(fml, op)){
    res[[k]] = fml[[3]]
    k = k + 1
    fml = fml[[2]]
  }
  res[[k]] = fml

  res
}

fml_maker = function(lhs, rhs){

  while(is_operator(lhs, "(")){
    lhs = lhs[[2]]
  }

  if(missing(rhs)){
    if(is_operator(lhs, "~")){
      return(lhs)
    }
    res = ~ .
    res[[2]] = lhs
  } else {

    while(is_operator(rhs, "(")){
      rhs = rhs[[2]]
    }

    res = . ~ .
    res[[2]] = lhs
    res[[3]] = rhs
  }

  res
}

fml_split_internal = function(fml, split.lhs = FALSE){

  fml_split_tilde = fml_breaker(fml, "~")
  k = length(fml_split_tilde)

  # NOTA in fml_breaker: to avoid copies, the order of elements returned is reversed

  # currently res is the LHS
  res = list(fml_split_tilde[[k]])

  if(k == 2){
    rhs = fml_breaker(fml_split_tilde[[1]], "|")
    l = length(rhs)

  } else if(k == 3){
    rhs  = fml_breaker(fml_split_tilde[[2]], "|")
    l = length(rhs)
    rhs_right = fml_breaker(fml_split_tilde[[1]], "|")

    if(length(rhs_right) > 1){
      stop_up("Problem in the formula: the formula in the RHS (expressing the IVs) cannot be multipart.")
    }

    # The rightmost element of the RHS is in position 1!!!!
    iv_fml = fml_maker(rhs[[1]], rhs_right[[1]])

    rhs[[1]] = iv_fml

  } else {
    # This is an error
    stop_up("Problem in the formula: you cannot have more than one RHS part containing a formula.")
  }

  if(!split.lhs){
    new_fml = fml_maker(res[[1]], rhs[[l]])

    res[[1]] = new_fml
    if(l == 1) return(res)
    res[2:l] = rhs[(l - 1):1]

  } else {
    res[1 + (1:l)] = rhs[l:1]
  }

  res
}


fml_split = function(fml, i, split.lhs = FALSE, text = FALSE, raw = FALSE){
  # I had to create that fonction to cope with the following formula:
  #
  #                       y ~ x1 | fe1 | u ~ z
  #
  # For Formula to work well, one would need to write insstead: y | x1 | fe1 | (u ~ z)
  # that set of parentheses are superfluous

  my_split = fml_split_internal(fml, split.lhs)

  if(raw){
    return(my_split)
  } else if(text){

    if(!missing(i)){
      return(deparse_long(my_split[[i]]))
    } else {
      return(sapply(my_split, deparse_long))
    }

  } else if(!missing(i)) {
    return(fml_maker(my_split[[i]]))

  } else {
    res = lapply(my_split, fml_maker)

    return(res)
  }

}

error_sender = function(expr, ..., clean, up = 0, arg_name){
  res = tryCatch(expr, error = function(e) structure(list(conditionCall(e), conditionMessage(e)), class = "try-error"))

  if("try-error" %in% class(res)){
    set_up(1 + up)
    msg = paste0(..., collapse = "")

    if(nchar(msg) == 0){
      if(missing(arg_name)){
        arg_name = deparse(substitute(expr))
      }
      msg = paste0("Argument `", arg_name, "` could not be evaluated: ")
      stop_up(msg, res[[2]])

    } else {

      # The expression that is evaluated is absolutely non informative for the user
      call_non_informative = deparse(substitute(expr), 100)[1]
      call_error = deparse(res[[1]], 100)[1]

      if(call_error == call_non_informative || call_error == "NULL" ||
         grepl("^(doTry|eval)", call_error)){
        call_error = ""

      } else {
        call_error = paste0("In ", call_error, ": ")
      }

      err = res[[2]]

      if(grepl("^in eval\\(str[^:]+:\n", err)){
        err = sub("^in eval\\(str[^:]+:\n", "", err)
      }

      if(!missing(clean)){

        if(grepl(" => ", clean)){
          clean_split = strsplit(clean, " => ")[[1]]
          from = clean_split[1]
          to = clean_split[2]
        } else {
          from = clean
          to = ""
        }
        
        err = gsub(from, to, err)
        call_error = gsub(from, to, call_error)

        stop_up(msg, "\n  ", call_error, err)
      } else {
        stop_up(msg, "\n  ", call_error, err)
      }
    }
  }

  res
}

is_fml_inside = function(fml){
  # we remove parentheses first

  while(is_operator(fml, "(")){
    fml = fml[[2]]
  }

  is_operator(fml, "~")
}


merge_fml = function(fml_linear, fml_fixef = NULL, fml_iv = NULL){

  is_fe = length(fml_fixef) > 0
  is_iv = length(fml_iv) > 0

  if(!is_fe && !is_iv){
    res = fml_linear
  } else {
    fml_all = deparse_long(fml_linear)

    if(is_fe){
      # we add parentheses if necessary
      if(is_operator(fml_fixef[[2]], "|")){
        fml_all[[2]] = paste0("(", as.character(fml_fixef)[2], ")")
      } else {
        fml_all[[2]] = as.character(fml_fixef)[2]
      }
    }

    if(is_iv) fml_all[[length(fml_all) + 1]] = deparse_long(fml_iv)

     res = as.formula(paste(fml_all, collapse = "|"), .GlobalEnv)
  }

  res
}


fixest_fml_rewriter = function(fml){
  # Currently performs the following
  # - expands lags
  # - protects powers: x^3 => I(x^3)
  #
  # fml = sw(f(y, 1:2)) ~ x1 + l(x2, 1:2) + x2^2 | fe1 | y ~ z::e + g^3
  # fml = y ~ 1 | id + period | l(x_endo, -1:1) ~ l(x_exo, -1:1)

  fml_text = deparse_long(fml)

  isPanel = grepl("(^|[^\\._[:alnum:]])(f|d|l)\\(", fml_text)
  isPower = grepl("^", fml_text, fixed = TRUE)

  if(isPanel){
    # We rewrite term-wise

    fml_parts = fml_split(fml, raw = TRUE)
    n_parts = length(fml_parts)

    #
    # LHS
    #

    # only panel: no power (bc no need), no interact

    # We tolerate multiple LHS and expansion
    lhs_text = fml_split(fml, 1, text = TRUE, split.lhs = TRUE)

    if(grepl("^(c|c?sw0?|list)\\(", lhs_text)){
      lhs_text2eval = gsub("^(c|c?sw0?|list)\\(", "sw(", lhs_text)
      lhs_names = eval(str2lang(lhs_text2eval))
    } else {
      lhs_names = lhs_text
    }

    lhs_all = error_sender(expand_lags_internal(lhs_names),
                           "Problem in the formula regarding lag/leads: ", 
                           clean = "__expand")

    if(length(lhs_all) > 1){
      lhs_fml = paste("c(", paste(lhs_all, collapse = ","), ")")
    } else {
      lhs_fml = lhs_all
    }

    lhs_text = lhs_fml

    #
    # RHS
    #

    # power + panel + interact

    if(isPower){
      # rhs actually also contains the LHS
      rhs_text = deparse_long(fml_parts[[1]])
      rhs_text = gsub("(?<!I\\()(\\b(\\.[[:alpha:]]|[[:alpha:]])[[:alnum:]\\._]*\\^[[:digit:]]+)", 
                      "I(\\1)", rhs_text, perl = TRUE)

      if(grepl("\\^[[:alpha:]]", rhs_text)){
        stop_up("The operator `^` between variables can be used only in the fixed-effects part of the formula. Otherwise, please use `:` instead.")
      }

      fml_rhs = as.formula(rhs_text)
    } else {
      fml_rhs = fml_maker(fml_parts[[1]])
    }

    rhs_terms = get_vars(fml_rhs)

    if(length(rhs_terms) == 0){
      rhs_text = "1"
    } else {
      rhs_terms = error_sender(expand_lags_internal(rhs_terms),
                               "Problem in the formula regarding lag/leads: ", 
                               clean = "__expand")

      if(attr(terms(fml_rhs), "intercept") == 0){
        rhs_terms = c("-1", rhs_terms)
      }

      rhs_text = paste(rhs_terms, collapse = "+")
    }

    fml_linear = as.formula(paste0(lhs_text, "~", rhs_text))

    #
    # FE + IV
    #

    fml_fixef = fml_iv = NULL

    if(n_parts > 1){

      #
      # FE
      #

      # Only isPanel (although odd....)

      is_fe = !is_fml_inside(fml_parts[[2]])
      if(is_fe){

        if(identical(fml_parts[[2]], 0)){
          fml_fixef = NULL

        } else {

          fml_fixef = fml_maker(fml_parts[[2]])
          fml_fixef_text = deparse_long(fml_fixef)

          if(grepl("(l|d|f)\\(", fml_fixef_text)){
            # We need to make changes
            # 1st: for terms to work => we change ^ if present (sigh)

            do_sub = grepl("^", fml_fixef_text, fixed = TRUE)

            if(do_sub){
              fml_fixef = as.formula(gsub("^", "%^%", fml_fixef_text, fixed = TRUE))
            }

            fixef_terms = attr(terms(fml_fixef), "term.labels")
            fixef_text = error_sender(expand_lags_internal(fixef_terms),
                                      "Problem in the formula regarding lag/leads: ", 
                                      clean = "__expand")

            if(do_sub){
              fixef_text = gsub(" %^% ", "^", fixef_text, fixed = TRUE)
            }

            fml_fixef = as.formula(paste("~", paste(fixef_text, collapse = "+")))

          }
        }
      }

      #
      # IV
      #

      if(n_parts == 3 || !is_fe){
        fml_iv = fml_maker(fml_parts[[n_parts]])

        fml_endo = .xpd(lhs = ~y, rhs = fml_iv[[2]])
        fml_inst = .xpd(lhs = ~y, rhs = fml_iv[[3]])

        endo_lag_expand = fixest_fml_rewriter(fml_endo)$fml
        inst_lag_expand = fixest_fml_rewriter(fml_inst)$fml

        fml_iv = .xpd(lhs = endo_lag_expand[[3]], rhs = inst_lag_expand[[3]])
      }
    }

    fml_new = merge_fml(fml_linear, fml_fixef, fml_iv)

  } else if(isPower){
    # This is damn costly.... but I have to deal with corner cases....
    # I think I should do that at the C level, this will be much faster I guess

    # We only take care of the RHS (we don't care about the LHS)
    no_lhs_text = gsub("^[^~]+~", "", fml_text)
    no_lhs_text = gsub("(?<!I\\()(\\b(\\.[[:alpha:]]|[[:alpha:]])[[:alnum:]\\._]*\\^[[:digit:]]+)", 
                       "I(\\1)", no_lhs_text, perl = TRUE)

    if(grepl("\\^[[:alpha:]]", no_lhs_text)){
      # We check if there is one ^ specifically in the RHS or in the IV part
      fml_parts = fml_split(fml, raw = TRUE)
      n_parts = length(fml_parts)

      rhs_txt = fml_split(fml, i = 1, text = TRUE)

      if(grepl("\\^[[:alpha:]]", rhs_txt)){
        stop_up("The operator `^` between variables can be used only in the fixed-effects part of the formula. Otherwise, please use `:` instead.")
      }

      if(is_fml_inside(fml_parts[[n_parts]])){
        iv_txt = fml_split(fml, i = n_parts, text = TRUE)

        if(grepl("\\^[[:alpha:]]", iv_txt)){
          stop_up("The operator `^` between variables can be used only in the fixed-effects part of the formula. Otherwise, please use `:` instead.")
        }
      }

    }

    fml_new = as.formula(paste0(gsub("~.+", "", fml_text), "~", no_lhs_text))

  } else {
    res = list(fml = fml, isPanel = FALSE)
    return(res)
  }

  res = list(fml = fml_new, isPanel = isPanel)

  return(res)
}


check_set_types = function(x, types, msg){
  arg_name = deparse(substitute(x))
  check_arg(x, "os formula | character vector no na", .arg_name = arg_name, .up = 1)

  if("formula" %in% class(x)){
    x = attr(terms(x), "term.labels")
  }

  check_set_value(x, "multi match", .choices = types, .arg_name = arg_name, .up = 1)

  x
}

check_set_digits = function(digits, up = 1){
  # Note that the argument name can be either digits or digits.stats

  set_up(up)
  check_value(digits, "integer scalar GE{1} | character scalar", 
              .arg_name = deparse(substitute(digits)))

  if(is.character(digits)){
    d_type = substr(digits, 1, 1)
    d_value = substr(digits, 2, nchar(digits))

    # Control
    if(!d_type %in% c("r", "s")){
      arg = deparse(substitute(digits))
      stop_up("The argument `", arg, "` must start with `r` (for round) or `s` (for significant). Currently it starts with `", d_type,"` which is not valid.\nExample of valid use: digits = `r3`.")
    }

    round = d_type == "r"

    if(!grepl("^[0-9]$", d_value)){
      arg = deparse(substitute(digits))
      stop_up("The argument `", arg, "` must be equal to the character `r` or `s` followed by a single digit. Currently `", digits,"` is not valid.\nExample of valid use: digits = `", d_type, "3`.")
    }

    digits = as.numeric(d_value)

  } else {
    round = FALSE
  }

  list(digits = digits, round = round)
}

get_vars = function(x){
  attr(terms(x), "term.labels")
}

mat_posdef_fix = function(X, tol = 1e-10){
  # X must be a symmetric matrix
  # We don't check it

  if(any(diag(X) < tol)){
    e = eigen(X, symmetric = TRUE)

    if (is.complex(e$values)) {
      attr(X, "is_complex") = TRUE
    } else {
      dm = dimnames(X)
      X = tcrossprod(e$vectors %*% diag(pmax(e$values, tol), nrow(X)), e$vectors)
      dimnames(X) = dm
    }
  }

  return(X)
}


is_fixest_call = function(){
  nf = sys.nframe()

  if(nf < 5) return(FALSE)

  last_calls = sapply(tail(sys.calls(), 13), function(x) deparse(x)[1])

  any(grepl("fixest", last_calls[-length(last_calls)]))
}

all_vars_with_i_prefix = function(fml){
  # fml = a ~ x1^x2 + i(x3, i.x4) + x5*i(x6, I(i.x7))

  vars = all.vars(fml)
  if(any(grepl("^i\\..+", vars))){
    fml_dp = deparse_long(fml)
    # for i. to work it MUST be the second argument (var)
    # valid cases:
    # - i(x1, i.x2)
    # - i(var = i.x2, x1)
    # - i(x1, i.I(x7))
    # Not valid:
    # - i(x1, I(i.x7))
    #
    # This means that in the parsed formula, i. is always preceded by a space and
    # either a "," or a "="

    # Maybe later: add nice error messages reminding how to use i.

    qui_i = which(grepl("^i\\..+", vars))
    i_vars = vars[qui_i]
    for(i in seq_along(i_vars)){
      fml_split = strsplit(fml_dp, i_vars[i], fixed = TRUE)[[1]]
      n = length(fml_split) - 1
      for(j in 1:n){
        part = fml_split[j]
        if(grepl("(,|var =) *$", part)){
          part = gsub("\\([^\\)]+\\)", "", part)
          if(grepl("i\\(", part)){
            # OK!
            ii = qui_i[i]
            vars[ii] = substr(vars[ii], 3, nchar(vars[ii]))
          }
        }
      }
    }
  }

  vars
}


# function to normalize character vectors into variable names
as_varname = function(x){
  # "x1" => "x1"
  # "(x1)" => "`(x1)`"


  qui_pblm = grepl("[^[:alnum:]\\._]", x)
  if(any(qui_pblm)){
    x[qui_pblm] = paste0("`", x[qui_pblm], "`")
    x[qui_pblm] = gsub("``", "`", x[qui_pblm])
  }

  x
}

update_file = function(path, text){

  if(file.exists(path)){
    text_1 = readLines(path)

    text_clean_1 = unlist(strsplit(text_1, "\n"))
    text_clean_2 = unlist(strsplit(text, "\n"))

    text_clean_1 = text_clean_1[grepl("[[:alnum:][:punct:]]", text_clean_1)]
    text_clean_2 = text_clean_2[grepl("[[:alnum:][:punct:]]", text_clean_2)]

    do_write = length(text_clean_1) != length(text_clean_2) || any(text_clean_1 != text_clean_2)

  } else {
    do_write = TRUE
  }

  # We write only if the text is different
  if(do_write){
    message("writing '", path, "'", appendLF = FALSE)
    f = file(path, "w", encoding = "utf-8")
    writeLines(text, f)
    close(f)
    message(".")
  }
}


fetch_arg_deparse = function(arg){
  # Utility to find out what the user originally provided as argument
  # Only useful for functions that are deeply nested (typically vcov)

  sc = rev(sys.calls())

  # initialization of the name
  arg_name = deparse_long(sc[[2]][[arg]])

  n = length(sc)
  if(n > 2){
    for(i in 2:length(sc)){
      mc = sc[[i]]
      if(grepl("[tT]ry", mc[[1]])) next
      if(!arg %in% names(mc)) break
      arg_name = deparse_long(mc[[arg]])
    }
  }

  arg_name
}



# Simple utilities to avoid writing too much
# adds a small overhead but that's OK (about 1us which is OK)
any_missing = function(x1, x2, x3, x4, x5, x6){
  n = length(sys.call()) - 1
  switch(n,
       "1" = missing(x1),
       "2" = missing(x1) || missing(x2),
       "3" = missing(x1) || missing(x2) || missing(x3),
       "4" = missing(x1) || missing(x2) || missing(x3) || missing(x4),
       "5" = missing(x1) || missing(x2) || missing(x3) || missing(x4) || missing(x5),
       "6" = missing(x1) || missing(x2) || missing(x3) || missing(x4) || missing(x5) || missing(x6))
}

all_missing = function(x1, x2, x3, x4, x5, x6){
  n = length(sys.call()) - 1
  switch(n,
       "1" = missing(x1),
       "2" = missing(x1) && missing(x2),
       "3" = missing(x1) && missing(x2) && missing(x3),
       "4" = missing(x1) && missing(x2) && missing(x3) && missing(x4),
       "5" = missing(x1) && missing(x2) && missing(x3) && missing(x4) && missing(x5),
       "6" = missing(x1) && missing(x2) && missing(x3) && missing(x4) && missing(x5) && missing(x6))
}

use_t_distr = function(x){
  # whether to use the t-distribution or the normal
  # x: fixest estimation
  x$method_type %in% "feols" || (x$method_type %in% "feglm" && !x$family$family %in% c("poisson", "binomial"))
}


is_fun_in_char = function(fml_char, fun){
  extract_fun(fml_char, fun, bool = TRUE)
}

extract_fun = function(fml_char, fun, err_msg = NULL, bool = FALSE, drop = TRUE){
  # A formula, in character form // can be a vector
  # fun: a function name in regex form
  #
  # returns a list:
  # before
  # fun
  # after
  #
  # fml_char = "to_integer(id, sw0(fe))" ;  fun = "sw0?"

  only_one = TRUE

  regex = paste0(c("(?<=^)", "(?<=[^[:alnum:]\\._])"),
           fun, "\\(", collapse = "|")

  is_there = grepl(regex, fml_char, perl = TRUE)

  if(bool) return(is_there)

  res = list()

  n = length(fml_char)
  for(i in 1:n){
    # fml_i = fml_char[1]

    fml_i = fml_char[i]
    if(is_there[i]){

      fml_split = strsplit(fml_i, regex, perl = TRUE)[[1]]

      if(only_one && length(fml_split) > 2){
        if(is.null(err_msg)){
          stop_up("Only one function `", fun, "` can be used at a time.")
        } else {
          stop_up(err_msg)
        }
      }

      # we need to add a last character otherwise => problems
      fml_value = paste0(fml_split[2], " ")
      fml_right_split = strsplit(fml_value, ")", fixed = TRUE)[[1]]
      n_open = pmax(lengths(strsplit(fml_right_split, "(", fixed = TRUE)) - 1, 0)

      n_parts = length(fml_right_split)
      fun_closed = which((1 + cumsum(n_open) - 1:n_parts) == 0)

      fun_name = substr(fml_i, nchar(fml_split[1]) + 1, nchar(fml_i))
      fun_name = gsub("\\(.+", "", fun_name)

      fun_char = paste0(fun_name, "(", paste(fml_right_split[1:fun_closed], collapse = ")"), ")")

      fml_right_rest = trimws(paste0(fml_right_split[-(1:fun_closed)], collapse = ")"))

      res[[i]] = list(before = fml_split[1],
              fun = fun_char,
              after = fml_right_rest)


    } else {
      res[[i]] = list(before = fml_i,
              fun = "",
              after = "")
    }
  }

  if(n == 1 && drop) res = res[[1]]

  return(res)
}

is_numeric_in_char = function(x){
  res = tryCatch(as.numeric(x), warning = function(x) "not numeric")
  !identical(res, "not numeric")
}

insert_in_between = function(x, y){
  n_x = length(x)
  n_y = length(y)

  if(n_y == 1) y = rep(y, n_x)
  if(n_x == 1) x = rep(x, n_y)
  n_x = length(x)

  res = rep(x, each = 2)
  res[2 * 1:n_x] = y

  res
}

str_trim = function(x, n_first = 0, n_last = 0){
  # positive values: first
  # negative values: last

  if(is.character(n_first)){
    n_first = nchar(n_first)
  } else if(n_first < 0){
    n_last = -n_first
    n_first = 0
  }

  substr(x, 1 + n_first, nchar(x) - n_last)
}

str_split = function(x, split){
  # Simple wrapper

  fixed = TRUE
  perl = FALSE
  if(grepl("@", split, fixed = TRUE)){
    if(grepl("^\\\\@", split)){
      split = str_trim(split, 1)
    } else if(grepl("^@", split)){
      split = str_trim(split, 1)
      fixed = FALSE
      perl = TRUE
    }
  }

  strsplit(x, split, fixed = fixed, perl = perl)
}

NA_fun = function(..., df){
  dots = list(...)
  n_dots = length(dots)

  if(n_dots == 0){
    res = !complete.cases(df)
    return(res)
  }

  res = is.na(dots[[1]])

  if(n_dots == 1){
    return(res)
  }

  for(i in 2:n_dots){
    res = res | is.na(dots[[i]])
  }
  res
}


eval_dot = function(x, up = 1){
  # Note that the right use of up is ESSENTIAL
  # it must refer to the parent frame from which the main call has been made
  # Otherwise an error will be thrown of "." not existing

  x_dp = deparse(substitute(x), 300)

  sysOrigin = sys.parent(up)
  mc = match.call(sys.function(sysOrigin), sys.call(sysOrigin))

  if(!x_dp %in% names(mc)){
    return(x)
  }

  my_list = list("." = list)

  eval(mc[[x_dp]], my_list, parent.frame(up + 1))
}


is_user_level_call = function(){
  length(sys.calls()) <= 2
}


is_calling_fun = function(pattern, full_search = FALSE, full_name = FALSE){
  sc_all = sys.calls()
  n_sc = length(sc_all)
  if(n_sc > 2){

    if(full_search){
      fun_all = sapply(tail(sc_all, 13), function(x) deparse(x)[1])

      if(full_name){
        pattern = sma("^{pattern}\\(")
      }

      res = any(grepl(pattern, fun_all))
    } else {
      if(grepl(".fixest", sc_all[[n_sc - 1]][[1]], fixed = TRUE)){
        if(n_sc == 3){
          return(FALSE)
        }

        sc = sc_all[[n_sc - 3]]
      } else {
        sc = sc_all[[n_sc - 2]]
      }

      fun_name = deparse(sc)[1]
      if(full_name){
        pattern = sma("^{pattern}\\(")
      }

      res = grepl(pattern, fun_name)
    }

    return(res)
  }

  FALSE
}

# x = ~a + (b + c) + d*k
extract_vars_simple = function(x){
  # We don't apply terms!
  # => m * (a + b) is not expanded
  # a bit slow

  check_arg(x, "formula")

  res = c()
  x = x[[length(x)]]

  while(is_operator(x, "+")){
    res = c(res, deparse_long(x[[3]]))
    x = x[[2]]
  }

  res = c(res, deparse_long(x))

  rev(res)
}

# x = "a + (b + c) + d*k"
char_to_vars = function(x){

  check_arg(x, "character scalar")

  if(!grepl("(", x, fixed = TRUE)){
    res = trimws(strsplit(x, "+", fixed = TRUE)[[1]])
    return(res)
  }

  res = c()
  x = str2lang(x)

  while(is_operator(x, "+")){
    res = c(res, deparse_long(x[[3]]))
    x = x[[2]]
  }

  res = c(res, deparse_long(x))

  rev(res)
}

not_too_many_messages = function(key){
  # we don't proc in less than 2 seconds
  # avoid ugly looping issues
  all_times = getOption("fixest_all_timings")
  old_time = all_times[[key]]
  if(is.null(old_time) || as.numeric(Sys.time() - old_time) > 2){
    if(is.null(all_times)){
      all_times = list()
    }
    all_times[[key]] = Sys.time()
    options(fixest_all_timings = all_times)
    return(TRUE)
  }

  FALSE
}


#### --------------- ####
#### Setters/Getters ####
####



#' Sets/gets whether to display notes in `fixest` estimation functions
#'
#' Sets/gets the default values of whether notes (informing for NA and observations removed) should be displayed in `fixest` estimation functions.
#'
#' @param x A logical. If `FALSE`, then notes are permanently removed.
#'
#' @author
#' Laurent Berge
#'
#' @examples
#'
#' # Change default with
#' setFixest_notes(FALSE)
#' feols(Ozone ~ Solar.R, airquality)
#'
#' # Back to default which is TRUE
#' setFixest_notes(TRUE)
#' feols(Ozone ~ Solar.R, airquality)
#'
setFixest_notes = function(x){
  check_arg(x, "mbt logical scalar")

  options("fixest_notes" = x)
}

#' @rdname setFixest_notes
getFixest_notes = function(){

  x = getOption("fixest_notes")
  if(length(x) != 1 || !is.logical(x) || is.na(x)){
    stop("The value of getOption(\"fixest_notes\") is currently not legal. Please use function setFixest_notes to set it to an appropriate value. ")
  }

  x
}

#' Sets/gets the number of threads to use in `fixest` functions
#'
#' Sets/gets the default number of threads to used in `fixest` estimation functions. The default is the maximum number of threads minus two.
#'
#'
#'
#' @param nthreads The number of threads. Can be: a) an integer lower than, or equal to, the 
#' maximum number of threads; b) 0: meaning all available threads will be used; c) a number 
#' strictly between 0 and 1 which represents the fraction of all threads to use. If missing, the 
#' default is to use 50% of all threads.
#' @param save Either a logical or equal to `"reset"`. Default is `FALSE`. If `TRUE` then the value 
#' is set permanently at the project level, this means that if you restart R, you will still obtain 
#' the previously saved defaults. This is done by writing in the `".Renviron"` file, located in the 
#' project's working directory, hence we must have write permission there for this to work, and 
#' only works with Rstudio. If equal to "reset", the default at the project level is erased. Since 
#' there is writing in a file involved, permission is asked to the user.
#'
#' @author
#' Laurent Berge
#'
#'
#' @examples
#'
#' # Gets the current number of threads
#' (nthreads_origin = getFixest_nthreads())
#'
#' # To set multi-threading off:
#' setFixest_nthreads(1)
#'
#' # To set it back to default at startup:
#' setFixest_nthreads()
#'
#' # And back to the original value
#' setFixest_nthreads(nthreads_origin)
#'
#'
setFixest_nthreads = function(nthreads, save = FALSE){
  # By default, we use only 50% of threads (never use all)

  max_CRAN = as.numeric(Sys.getenv("OMP_THREAD_LIMIT"))
  max_CRAN[is.na(max_CRAN)] = 1000

  max_threads = min(cpp_get_nb_threads(), 1000, max_CRAN) # we cap at 1k nthreads

  check_set_arg(save, "logical scalar | match(reset)")

  do_reset = identical(save, "reset")

  if(missing(nthreads) || is.null(nthreads)){
    # We first get the default from the environment variable
    # If it is missing => 50% of all threads

    # 0.5 => 50% of all available threads (usually equiv to the nber of procs)

    nthreads_default = renvir_get("fixest_nthreads")

    if(!do_reset && !is.null(nthreads_default)){
      if(!isScalar(nthreads_default) || nthreads_default < 0){
        warning("The variable setting the number of threads in the .Renviron file is corrupted. It's value has been reset.")
        renvir_update("fixest_nthreads", NULL)
        nthreads_default = 0.5
      }

    } else {
      nthreads_default = 0.5
    }

    nthreads = check_set_nthreads(nthreads_default)

  }

  nthreads = check_set_nthreads(nthreads)

  if(do_reset){
    renvir_update("fixest_nthreads", NULL)
  } else if(save){
    renvir_update("fixest_nthreads", nthreads)
  }

  options("fixest_nthreads" = nthreads)

  invisible()
}

#' @rdname setFixest_nthreads
getFixest_nthreads = function(){

  x = getOption("fixest_nthreads")
  if(length(x) != 1 || !is.numeric(x) || is.na(x) || x %% 1 != 0 || x < 0){
    stop("The value of getOption(\"fixest_nthreads\") is currently not legal. Please use function setFixest_nthreads to set it to an appropriate value. ")
  }

  x
}



#' Transforms a character string into a dictionary
#'
#' Transforms a single character string containing a dictionary in a textual format into a proper dictionary, that is a named character vector
#'
#' @param x A character scalar of the form `"variable 1: definition \n variable 2: definition"` 
#' etc. Each line of this character must contain at most one definition with, on the left the 
#' variable name, and on the right its definition. The separation between the variable and its 
#' definition must be a colon followed with a single space (i.e. ": "). You can stack definitions 
#' within a single line by making use of a semi colon: `"var1: def; var2: def"`. White spaces on 
#' the left and right are ignored. You can add commented lines with a `"#"`. Non-empty, 
#' non-commented lines that don't have the proper format witll raise an error.
#'
#' @details
#' This function is mostly used in combination with [`setFixest_dict`] to set the dictionary to be 
#' used in the function [`etable`].
#'
#' @return
#' It returns a named character vector.
#'
#' @author
#' Laurent Berge
#'
#' @seealso
#' [`etable`], [`setFixest_dict`]
#'
#' @examples
#'
#' x = "# Main vars
#'      mpg: Miles per gallon
#'      hp: Horsepower
#'
#'      # Categorical variables
#'      cyl: Number of cylinders; vs: Engine"
#'
#' as.dict(x)
#'
#'
#'
as.dict = function(x){
  check_arg(x, "character scalar")

  text_split = strsplit(x, ";|\n")[[1]]

  text_clean = trimws(text_split)
  text_clean = text_clean[nchar(text_clean) > 0 & !grepl("^#", text_clean)]

  if(any(!grepl(": ", text_clean))){
    stop("All definitions must be of the form 'variable: definition', the colon and space are indispensible. Currently the following line is not valid:\n", text_clean[!grepl(": ", text_clean)][1])
  }

  dict_names = sapply(strsplit(text_clean, ": ", fixed = TRUE), `[[`, 1)
  if(anyDuplicated(dict_names)){
    tb = sort(table(dict_names), decreasing = TRUE)[1]
    stop("The names contain duplicate entries: this is not allowed. The value ",
       names(tb), " appears ", dreamerr::n_times(tb), ".")
  }

  values = substr(text_clean, nchar(dict_names) + 3, nchar(text_clean))

  setNames(values, dict_names)
}

#' Sets/gets the dictionary relabeling the variables
#'
#' Sets/gets the default dictionary used in the function [`etable`], [`did_means`] and 
#' [`coefplot`]. The dictionaries are used to relabel variables (usually towards a fancier, more 
#' explicit formatting) when exporting them into a Latex table or displaying in graphs. By setting 
#' the dictionary with `setFixest_dict`, you can avoid providing the argument `dict`.
#'
#'
#' @param dict A named character vector or a character scalar. E.g. to change my variable named "a" 
#' and "b" to (resp.) "$log(a)$" and "$bonus^3$", then use 
#' `dict = c(a="$log(a)$", b3="$bonus^3$")`. Alternatively you can feed a character scalar 
#' containing the dictionary in the form `"variable 1: definition \n variable 2: definition"`. In 
#' that case the function [`as.dict`] will be applied to get a proper dictionary. This dictionary 
#' is used in Latex tables or in graphs by the function [`coefplot`]. If you want to separate Latex 
#' rendering from rendering in graphs, use an ampersand first to make the variable specific to 
#' `coefplot`.
#' @param ... You can add arguments of the form: `variable_name = "Definition"`. This is an 
#' alternative to using a named vector in the argument `dict`.
#' @param reset Logical, default is `FALSE`. If `TRUE`, then the dictionary is reset. Note that the 
#' default dictionary always relabels the variable "(Intercept)" in to "Constant". To overwrite it, 
#' you need to add "(Intercept)" explicitly in your dictionary.
#'
#' @details
#' By default the dictionary only grows. This means that successive calls with not erase the 
#' previous definitions unless the argument `reset` has been set to `TRUE`.
#'
#' The default dictionary is equivalent to having `setFixest_dict("(Intercept)" = "Constant")`. To 
#' change this default, you need to provide a new definition to `"(Intercept)"` explicitly.
#'
#' @author
#' Laurent Berge
#'
#'
#' @examples
#'
#' data(trade)
#' est = feols(log(Euros) ~ log(dist_km)|Origin+Destination+Product, trade)
#' # we export the result & rename some variables
#' etable(est, dict = c("log(Euros)"="Euros (ln)", Origin="Country of Origin"))
#'
#' # If you export many tables, it can be more convenient to use setFixest_dict:
#' setFixest_dict(c("log(Euros)"="Euros (ln)", Origin="Country of Origin"))
#' etable(est) # variables are properly relabeled
#'
#' # The dictionary only 'grows'
#' # Here you get the previous two variables + the new one that are relabeled
#' # Btw you set the dictionary directly using the argument names:
#' setFixest_dict(Destination = "Country of Destination")
#' etable(est)
#'
#' # Another way to set a dictionary: with a character string:
#' # See the help page of as.dict
#' dict = "log(dist_km): Distance (ln); Product: Type of Good"
#' setFixest_dict(dict)
#' etable(est)
#'
#' # And now we reset:
#' setFixest_dict(reset = TRUE)
#' etable(est)
#'
setFixest_dict = function(dict = NULL, ..., reset = FALSE){

  check_arg(dict, "NULL named character vector no na | character scalar")

  if(is.null(names(dict)) && !is.null(dict)){
    dict = error_sender(as.dict(dict), "In setFixest_dict, problem when coercing the dictionay with as.dict().")
  }

  check_arg(..., "dotnames character scalar",
            .message = sma("In '...', each argument must be named. ",
                           "The argument name corresponds to the variable to be renamed while",
                           " the value must be a character scalar ",
                           "(how the variable should be renamed)."))

  dots = list(...)
  dict = as.list(dict)
  dict[names(dots)] = dots

  if(reset){
    core_dict = list("(Intercept)" = "Constant")
  } else {
    core_dict = getOption("fixest_dict")
    if(is.null(core_dict)) core_dict = list()
  }

  core_dict[names(dict)] = dict

  options("fixest_dict" = unlist(core_dict))
}

#' @rdname setFixest_dict
getFixest_dict = function(){

  x = getOption("fixest_dict")
  if(length(x) > 0){
    if(!is.character(x) || !isVector(x) || anyNA(x)){
      stop("The value of getOption(\"fixest_dict\") is currently not legal. Please use function setFixest_dict to set it to an appropriate value. ")
    }
  }

  x
}


#' @rdname print.fixest
setFixest_print = function(type = "table", fitstat = NULL){

  check_set_arg(type, "match(coef, table)")

  if(!missnull(fitstat)){
    fitstat = fitstat_validate(fitstat, TRUE)
  }

  # Getting the existing defaults
  opts = getOption("fixest_print")

  if(is.null(opts)){
    opts = list()
  } else if(!is.list(opts)){
    warning("Wrong formatting of option 'fixest_print', all options are reset.")
    opts = list()
  }

  # Saving the default values
  mc = match.call()
  args_default = names(mc)[-1]

  # NOTA: we don't allow delayed evaluation => all arguments must have hard values
  for(v in args_default){
    opts[[v]] = eval(as.name(v))
  }

  options(fixest_print = opts)
}


#' @rdname print.fixest
getFixest_print = function(){

  x = getOption("fixest_print")
  if(!(is.null(x) || is.list(x))){
    stop("The value of getOption(\"fixest_print\") is currently not legal. Please use function setFixest_print to set it to an appropriate value. ")
  }

  x
}



#' Sets/gets formula macros
#'
#' You can set formula macros globally with `setFixest_fml`. These macros can then be used in `fixest` estimations or when using the function [`xpd`][fixest::setFixest_fml].
#'
#' @inherit xpd examples
#'
#' @param ... Definition of the macro variables. Each argument name corresponds to the name of the 
#' macro variable. It is required that each macro variable name starts with two dots 
#' (e.g. `..ctrl`). The value of each argument must be a one-sided formula or a character vector, 
#' it is the definition of the macro variable. Example of a valid call: 
#' `setFixest_fml(..ctrl = ~ var1 + var2)`. In the function `xpd`, the default macro variables are 
#' taken from `getFixest_fml`, any variable in `...` will replace these values. You can enclose 
#' values in `.[]`, if so they will be evaluated from the current environment. 
#' For example `..ctrl = ~ x.[1:2] + .[z]` will lead to `~x1 + x2 + var` if `z` is equal to `"var"`.
#' @param reset A logical scalar, defaults to `FALSE`. If `TRUE`, 
#' all macro variables are first reset (i.e. deleted).
#'
#' @details
#' In `xpd`, the default macro variables are taken from `getFixest_fml`. 
#' Any value in the `...` argument of `xpd` will replace these default values.
#'
#' The definitions of the macro variables will replace in verbatim the macro variables. 
#' Therefore, you can include multipart formulas if you wish but then beware of the order the 
#' macros variable in the formula. For example, using the airquality data, say you want to set as 
#' controls the variable `Temp` and `Day` fixed-effects, you can do 
#' `setFixest_fml(..ctrl = ~Temp | Day)`, but then
#'  `feols(Ozone ~ Wind + ..ctrl, airquality)` will be quite different from 
#' `feols(Ozone ~ ..ctrl + Wind, airquality)`, so beware!
#'
#' @return
#' The function `getFixest_fml()` returns a list of character strings, the names 
#' corresponding to the macro variable names, the character strings corresponding 
#' to their definition.
#'
#' @seealso
#' [`xpd`] to make use of formula macros.
#'
#'
#'
setFixest_fml = function(..., reset = FALSE){

  check_arg(reset, get, "logical scalar")

  fml_macro = parse_macros(..., reset = reset, frame = parent.frame())

  options("fixest_fml_macro" = fml_macro)

}

#' @rdname setFixest_fml
getFixest_fml = function(){
  fml_macro = getOption("fixest_fml_macro")
  if(is.null(fml_macro)){
    options("fixest_fml_macro" = list())
    fml_macro = list()
  } else if(!is.list(fml_macro)){
    options("fixest_fml_macro" = list())
    warning("The value of getOption(\"fixest_fml_macro\") is not legal, it has been reset. Please use only 'setFixest_fml' to set formula macro variables.")
    fml_macro = list()
  }

  fml_macro
}



#' Default arguments for fixest estimations
#'
#' This function sets globally the default arguments of fixest estimations.
#'
#' @inheritParams feols
#' @inheritParams feNmlm
#' @inheritParams feglm
#'
#' @param reset Logical scalar, default is `FALSE`. Whether to reset all values.
#'
#' @return
#' The function `getFixest_estimation` returns the currently set global defaults.
#'
#' @examples
#'
#' #
#' # Example: removing singletons is FALSE by default
#' #
#'
#' # => changing this default
#'
#' # Let's create data with singletons
#' base = iris
#' names(base) = c("y", "x1", "x2", "x3", "species")
#' base$fe_singletons = as.character(base$species)
#' base$fe_singletons[1:5] = letters[1:5]
#'
#' res          = feols(y ~ x1 + x2 | fe_singletons, base)
#' res_noSingle = feols(y ~ x1 + x2 | fe_singletons, base, fixef.rm = "single")
#'
#' # New defaults
#' setFixest_estimation(fixef.rm = "single")
#' res_newDefault = feols(y ~ x1 + x2 | fe_singletons, base)
#'
#' etable(res, res_noSingle, res_newDefault)
#'
#' # Resetting the defaults
#' setFixest_estimation(reset = TRUE)
#'
#'
#'
setFixest_estimation = function(data = NULL, panel.id = NULL, fixef.rm = "perfect",
                                fixef.tol = 1e-6, fixef.iter = 10000, collin.tol = 1e-10,
                                lean = FALSE, verbose = 0, warn = TRUE, combine.quick = NULL,
                                demeaned = FALSE, mem.clean = FALSE, glm.iter = 25,
                                glm.tol = 1e-8, data.save = FALSE, reset = FALSE){

  check_set_arg(fixef.rm, "match(none, perfect, singleton, both)")
  check_arg(fixef.tol, collin.tol, glm.tol, "numeric scalar GT{0}")
  check_arg(fixef.iter, glm.iter, "integer scalar GE{1}")
  check_arg(verbose, "integer scalar GE{0}")
  check_arg(lean, warn, demeaned, mem.clean, reset, data.save, "logical scalar")
  check_arg(combine.quick, "NULL logical scalar")
  check_arg(panel.id, "NULL character vector len(,2) no na | os formula")

  if(!missing(data)){
    check_arg(data, "NULL data.frame | matrix")
    if(!is.null(data)){
      data = deparse_long(substitute(data))
      class(data) = "default_data"
    }
  }

  # Getting the existing defaults
  opts = getOption("fixest_estimation")

  if(reset || is.null(opts)){
    opts = list()
  } else if(!is.list(opts)){
    warning("Wrong formatting of option 'fixest_estimation', all options are reset.")
    opts = list()
  }

  # Saving the default values
  mc = match.call()
  args_default = setdiff(names(mc)[-1], "reset")

  # NOTA: we don't allow delayed evaluation => all arguments must have hard values
  for(v in args_default){
    opts[[v]] = eval(as.name(v))
  }

  options(fixest_estimation = opts)

}

#' @rdname setFixest_estimation
getFixest_estimation = function(){
  getOption("fixest_estimation")
}


#' Sets properties of `fixest_multi` objects
#' 
#' Use this function to change the default behavior of `fixest_multi` objects.
#' 
#' @param drop Logical scalar, default is `FALSE`. Provides the default value of the argument 
#' `drop` when subsetting `fixest_multi` objects.
#' 
#' 
#' @return 
#' The function `getFixest_multi()` returns the list of settings.
#' 
#' @examples 
#' 
#' # 1) let's run a multiple estimation
#' base = setNames(iris, c("y", "x1", "x2", "x3", "species"))
#' est = feols(y ~ csw(x1, x2, x3), base)
#' 
#' # 2) let's pick a single estimation => by default we have a `fixest_multi` object
#' class(est[rhs = 2])
#' 
#' # `drop = TRUE` would have led to a `fixest` object
#' class(est[rhs = 2, drop = TRUE])
#' 
#' # 3) change the default behavior
#' setFixest_multi(drop = TRUE)
#' class(est[rhs = 2])
#' 
#' 
setFixest_multi = function(drop = FALSE){
  check_arg(drop, "logical scalar")
  
  opts = list(drop = drop)
  options(fixest_multi_opts = opts)
}

#' @rdname setFixest_multi
getFixest_multi = function(){

  x = getOption("fixest_multi_opts")
  
  if(is.null(x)){
    setFixest_multi()
    x = getOption("fixest_multi_opts")
  }
  
  if(!(is.null(x) || is.list(x))){
    stop("The value of getOption(\"fixest_multi_opts\") is currently not legal. Please use function `setFixest_multi` to set it to an appropriate value. ")
  }

  x
}


#### .................. ####
#### DOCUMENTATION DATA ####
####



#' Trade data sample
#'
#' This data reports trade information between countries of the European Union (EU15).
#'
#' @usage
#' data(trade)
#'
#' @format
#' `trade` is a data frame with 38,325 observations and 6 variables named `Destination`, `Origin`, `Product`, `Year`, `dist_km` and `Euros`.
#'
#' * Origin: 2-digits codes of the countries of origin of the trade flow.
#' * Destination: 2-digits codes of the countries of destination of the trade flow.
#' * Products: Number representing the product categories (from 1 to 20).
#' * Year: Years from 2007 to 2016
#' * dist_km: Geographic distance in km between the centers of the countries of origin and destination.
#' * Euros: The total amount in euros of the trade flow for the specific year/product category/origin-destination country pair.
#'
#'
#' @source
#' This data has been extrated from Eurostat on October 2017.
#'
#'
#'
"trade"

Try the fixest package in your browser

Any scripts or data that you put into this service are public.

fixest documentation built on June 22, 2024, 9:12 a.m.