R/forwardSearch_regL1.R
In diagL1: Routines for Fit, Inference and Diagnostics in Linear L1 and LAD Models

Documented in forwardSearch_regL1

#main author: Kévin Allan Sales Rodrigues

#' Forward Search in Linear L1 Models
#'
#' This function applies the forward search approach to robust analysis in linear L1 models. This function is based on function \code{fwdlm} of package 'forward'.
#'
#' @param formula a formula object, with the response on the left of a ~ operator, and the terms, separated by + operators, on the right.
#' @param data a data.frame in which to interpret the variables named in the formula, or in the subset and the weights argument. If this is missing, then the variables in the formula should be on the search list. This may also be a single number to handle some special cases – see below for details.
#' @param nsamp the initial subset for the forward search in linear regression is found by fitting the regression model with the R function \code{\link{lmsreg}}. This argument allows to control how many subsets are used in the Least Median of Squares regression. The choices are: the number of samples or "best" (the default) or "exact" or "sample". For details see \code{\link{lmsreg}}.
#' @param intercept logical for the inclusion of the intercept (if no formula is provided).
#' @param trace logical, if TRUE a message is printed for every ten iterations completed during the forward search.
#' @param subset an optional vector specifying a subset of observations to be used in the fitting process.
#' @param weights vector of observation weights; if supplied, the algorithm fits to minimize the sum of the weights multiplied into the absolute residuals. The length of weights must be the same as the number of observations. The weights must be nonnegative and it is strongly recommended that they be strictly positive, since zero weights are ambiguous.
#' @param na.action a function to filter missing data. This is applied to the model.frame after any subset argument has been used. The default (with na.fail) is to create an error if any missing values are found. A possible alternative is na.omit, which deletes observations that contain one or more missing values.
#' @param method the algorithmic method used to compute the fit. There are several options: "br", "fn", "pfn", "sfn", "fnc", "conquer", "pfnb", "qfnb", "ppro" and "lasso". See \code{\link{rq}} for more details.
#' @param model if TRUE then the model frame is returned. This is essential if one wants to call summary subsequently.
#' @param contrasts a list giving contrasts for some or all of the factors default = NULL appearing in the model formula. The elements of the list should have the same name as the variable and should be either a contrast matrix (specifically, any full-rank matrix with as many rows as there are levels in the factor), or else a function to compute such a matrix given the number of levels.
#' @param ... additional arguments for the fitting routines (see \code{\link{rq.fit.br}} and \code{\link{rq.fit.fnb}}, etc. and the functions they call).
#'
#' @returns A fitted forward search in linear L1 regression model object.
#'
#' @importFrom MASS lmsreg
#'
#' @export
#'
#' @references Atkinson, A.C. and Riani, M. (2000). \emph{Robust Diagnostic Regression Analysis}. New York: Springer.
#'
#'
#' @examples
#' \donttest{
#' # applies the forward search approach to robust analysis in a linear L1 model
#' mod = forwardSearch_regL1(Concentration ~ Age, data = bile)
#' }

forwardSearch_regL1 = function(formula, data, nsamp = "best",
                               intercept = TRUE, trace = TRUE,
                               subset, weights, na.action, method = "br", #argumentos padrão
                               model = TRUE, contrasts = NULL, ...)
{
  tau = 0.5 # to L1 regression
  if (!missing(formula))
  {
    m <- call("model.frame", formula = formula)

    if (!missing(data))
      m$data <- data

    if (!missing(na.action))
      m$na.action <- na.action

    m <- eval(m, parent.frame())
    Terms <- attr(m, "terms")
    xvars <- as.character(attr(Terms, "variables"))

    if ((yvar <- attr(Terms, "response")) > 0)
      xvars <- xvars[ - yvar]
    if (length(xvars) > 0)
    {
      xlevels <- lapply(m[xvars], levels)
      xlevels <- xlevels[!sapply(xlevels, is.null)]
      if (length(xlevels) == 0)
        xlevels <- NULL
    }
    else
      xlevels <- NULL
    y <- model.response(m, "numeric")
    x <- model.matrix(Terms, m, NULL)
  }
  else
  {
    if (is.null(x) || is.null(y))
      stop("No data supplied")
    y <- as.vector(y)
    x <- as.matrix(x)
    if (intercept)
      x <- cbind(1,x)
  }

  n = nrow(x)
  p = ncol(x)

  ## We supress warnings generated by lmsreg.default, e.g.
  ## More than half of the subsamples were singular.

  temp.warn <- options()$warn
  options(warn = -1)
  lms <- MASS::lmsreg(x, y, nsamp = nsamp, intercept = FALSE)
  options(warn = temp.warn)

  sor <- order(abs(lms$residuals))
  coef.names <- names(lms$coefficients)

  # included = all cases included at each step ##
  included <- list()

  # Unit = Unit(s) included in each step
  Unit <- matrix(as.integer(NA), nrow = n-p+1, ncol = 5)

  # Coefficients = estimated beta Coefficients
  Coefficients <- matrix(as.double(NA), nrow = n - p + 1, ncol = p)

  # tStatistics = t statistics
  tStatistics <- matrix(as.double(NA), nrow = n - p, ncol = p)

  # MAE - Mean Absolute Error
  MAE <- matrix(as.double(NA), nrow = n - p + 1, ncol = 1)

  # R2
  # R2 <- matrix(as.double(NA), nrow = n - p + 1, ncol = 1)

  ## MinDelRes = Minimum deletion residual
  MinDelRes <- matrix(as.double(NA), n - p - 1, 1)

  ## Residuals = Residuals in each step
  Residuals <- matrix(as.double(NA), n, n-p+1)

  residuals <- lms$residuals
  inibsb <- integer(0)
  n1 <- 1:n

  if (trace)
    message("Starting Forward Search.\n")

  for (m in p:n)
  {
    if(trace && m %% 10 == 0)
      message(paste("Forward Search: finished ", m, " steps out of ", n, ".\n", sep = ""))

    bsb <- sor[1:m]

    included[[m-p+1]] <- sort(bsb)  # added info

    unit <- setdiff(bsb, inibsb)
    if(length(unit) > 5)
      Unit[m-p+1, ] <- unit[1:5]
    else
      Unit[m-p+1, 1:length(unit)] <- unit

    inibsb <- bsb

    xb <- x[bsb,]
    yb <- y[bsb]

    xb_without_ones = as.matrix(xb)[,-1] # removing ones column

    zz <- try(regL1(yb ~ xb_without_ones))

    if (!inherits(zz,"try-error") )# && (zz$qr$rank == p))
    {
      residuals <- y - x %*% zz$coefficients
      if (!any(is.na(residuals)))
        sor <- order(abs(residuals)) # get order of minimum residuals

      ### store residuals ###
      Residuals[ ,m-p+1] <- residuals

      ### store estimated coefficients ###
      Coefficients[m-p+1, ] <- zz$coefficients

      ### store MAE ### MAE - Mean Absolute Error
      mae = zz$MAE
      MAE[m-p+1] <- mae # 'mae' is temp MAE

      ### store R^2 ### -> trocar por R2 da regL1
      # R2[m-p+1] <- mae # 1-(SSR/sum((yb - mean(yb))^2))

      ## If mAm is invertable then we compute the t statistics and ##
      ## distances. ##

      mAm <- t(xb) %*% xb
      # mmx <- try(solve(mAm)) # does not work
      if (det(mAm, tol=1e-20) == 0)
      { mmx <- "singular matrix"
      class(mmx) <- "error" }
      else
        mmx <- solve(mAm, tol=1e-20)

      if (!inherits(mmx,"try-error") && (m > p))
      {
        ### store t statistics ### extracted from summary

        # trying get beta's estimates
        beta_try = try(
          suppressWarnings({
            as.numeric(summary(zz, se = 'iid')$coefficients[,3])
          })
          , silent = TRUE)
        if(inherits(beta_try, "try-error")){
          beta_try = rep(0,p)
        }

        tStatistics[m-p, ] <- beta_try

      }

      if (!inherits(mmx,"try-error") && (m < n))
      {
        # bsb.comp = o que não está em bsb
        # bsb.comp = complement of bsb #used to be ncl
        bsb.comp <- setdiff(n1, bsb)
        # find minimum deletion residual
        ord <- numeric(length(bsb.comp))

        #hii is leverage!
        # tenho que refazer considerando o reajuste L1
        for (i in 1:length(bsb.comp))
        {
          #hii <- x[bsb.comp[i], , drop = FALSE] %*% mmx %*% t(x[bsb.comp[i], , drop = FALSE])
          #ord[i] <- residuals[bsb.comp[i]]^2 / (1+hii)
          ord[i] <- residuals[bsb.comp[i]]
        }

        #somente dividi o resíduo absoluto pelo MAE ()
        if (!is.na(MAE[m-p+1]) && MAE[m-p+1] > 0)
          MinDelRes[m-p, 1] <- sqrt(min(abs(ord)) / MAE[m-p+1])
      }

    } #end of zz was fit if block
    else
    {
      if (m == p)
        Residuals[ ,m-p+1] <- y - x %*% lms$coef
      else
        Residuals[ ,m-p+1] <- Residuals[ ,m-p]
    }

  } #end of for loop

  Residuals <- Residuals / mae

  #### names
  dimnames(Unit) <- list(paste("m=", p:n, sep=""), paste("Inclusion", 1:5))
  names(included) <- paste("m=", p:n, sep = "")
  dimnames(Coefficients) <- list(paste("m=", p:n, sep=""), coef.names)
  dimnames(tStatistics) <- list(paste("m=", (p+1):n, sep=""),  coef.names)
  dimnames(Residuals) <- list(n1, paste("m=", p:n, sep=""))
  #dimnames(R2) <- list(p:n, c("R^2"))
  dimnames(MinDelRes) <- list((p+1):(n-1), "Min del. res.")

  #excluding lines of zeros in tStatistics # NULL values
  soma_abs <- rowSums(abs(tStatistics))
  tStatistics <- tStatistics[soma_abs != 0, ]
  number_excluded_lines = sum(soma_abs == 0)

  ans <- list(call = match.call(),
              Residuals = Residuals,
              Unit = Unit,
              included = included,
              Coefficients = Coefficients,
              tStatistics = tStatistics,
              MAE = MAE,
              #R2 = R2,
              MinDelRes = MinDelRes,
              StartingModel = lms)

  class(ans) <- "forwardSearch_regL1"
  ans
}