# R/robustlm.R

#' @title Robust variable selection with exponential squared loss
#'
#' @description \code{robustlm} carries out robust variable selection with exponential squared loss. 
#' A block coordinate gradient descent algorithm is used to minimize the loss function.
#'
#' @param x Input matrix, of dimension nobs * nvars; each row is an observation vector. Should be in matrix format.
#' @param y Response variable. Should be a numerical vector or matrix with a single column.
#' @param gamma Tuning parameter in the loss function, which controls the trade-off between robustness and efficiency of the regression estimators.
#' The loss function is defined as \deqn{1-\exp(-t^2/\gamma).}
#' When \code{gamma} is large, the loss is nearly quadratic, so the estimators approach the least
#' squares estimators in the extreme case. A smaller \code{gamma} limits the influence of an outlier on the estimators,
#' although it can also reduce the sensitivity of the estimators. If \code{gamma=NULL} (the default), it is selected by a
#' data-driven procedure that yields both high robustness and high efficiency.
#' @param weight Weight in the penalty. The penalty is given by \deqn{n\sum_{j=1}^d\lambda_{n j}|\beta_{j}|.} \code{weight} is a vector consisting of the \eqn{\lambda_{n j}}s. If \code{weight=NULL} (the default),
#' it is set to \eqn{\log(n)/(n|\tilde{\beta}_j|),}
#' where \eqn{\tilde{\beta}} is an initial estimator of the regression coefficients obtained by an MM procedure.
#' The default value meets a BIC-type criterion (see Details).
#' @param intercept Should an intercept be fitted (\code{TRUE}, the default) or set to zero (\code{FALSE})?
#'
#'
#' @return An object with S3 class "robustlm", which is a \code{list} with the following components:
#' \item{beta}{The regression coefficients.}
#' \item{alpha}{The intercept.}
#' \item{gamma}{The tuning parameter used in the loss.}
#' \item{weight}{The regularization parameters.}
#' \item{loss}{Value of the loss function calculated on the training set.}
#' @details \code{robustlm} solves the following optimization problem to obtain robust estimators of regression coefficients:
#' \deqn{\arg\min_{\beta} \sum_{i=1}^n(1-\exp\{-(y_i-x_i^T\beta)^2/\gamma_n\})+n\sum_{j=1}^d p_{\lambda_{nj}}(|\beta_j|),}
#' where \eqn{p_{\lambda_{n j}}(|\beta_{j}|)=\lambda_{n j}|\beta_{j}|} is the adaptive LASSO penalty. A block coordinate gradient descent algorithm is used to efficiently solve the optimization problem.
#' The tuning parameter \code{gamma} and regularization parameter \code{weight} are chosen adaptively by default, but they may also be supplied by the user.
#' Specifically, the default \code{weight} meets the following BIC-type criterion:
#' \deqn{\min_{\tau_n} \sum_{i=1}^{n}[1-\exp\{-(Y_i-x_i^T\beta)^{2}/\gamma_{n}\}]+n \sum_{j=1}^{d} \tau_{n j}|\beta_j|/|\tilde{\beta}_{n j}|-\sum_{j=1}^{d} \log(0.5 n \tau_{n j}) \log(n).}
#' 
#'
#' @importFrom stats model.frame
#' @importFrom matrixStats rowMedians
#' @importFrom MASS rlm
#' @importFrom MASS mvrnorm
#' @importFrom stats cov
#' @importFrom stats median
#' @importFrom stats rnorm
#' @import stats
#'
#'
#' @references Xueqin Wang, Yunlu Jiang, Mian Huang & Heping Zhang (2013) Robust Variable Selection With Exponential Squared Loss, Journal of the American Statistical Association, 108:502, 632-643, DOI: 10.1080/01621459.2013.766613
#' @references Tseng, P., Yun, S. A coordinate gradient descent method for nonsmooth separable minimization. Math. Program. 117, 387-423 (2009). https://doi.org/10.1007/s10107-007-0170-0
#' 
#' @author Borui Tang, Jin Zhu, Xueqin Wang
#'
#' @examples
#' library(MASS)
#' set.seed(1)
#' N <- 100
#' p <- 8
#' rho <- 0.2
#' mu <- rep(0, p)
#' Sigma <- rho * outer(rep(1, p), rep(1, p)) + (1 - rho) * diag(p)
#' ind <- 1:p
#' beta <- (-1)^ind * exp(-2 * (ind - 1) / 20)
#' lambda_seq <- seq(0.05, 5, length.out = 100)
#' X <- mvrnorm(N, mu, Sigma)
#' Z <- rnorm(N, 0, 1)
#' k <- sqrt(var(X %*% beta) / (3 * var(Z)))
#' Y <- X %*% beta + drop(k) * Z
#' fit <- robustlm(X, Y)
#' fit
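#'
#' # Individual components of the fit (see the Value section above):
#' fit$beta
#' fit$alpha
#' fit$gamma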
#' @export
#'
robustlm <- function(x, y, gamma = NULL, weight = NULL,
                intercept = TRUE) {
  obs <- nrow(x)
  if (isTRUE(intercept)) {
    x <- cbind(rep(1, obs), x)
  }
  data <- x
  res <- y

  # compute beta0: initial coefficient estimate from an MM regression (MASS::rlm)
  data_all <- data.frame(res, data)
  rlmb <- suppressWarnings(MASS::rlm(res ~ . - 1, data = data_all, method = "MM"))
  beta0 <- rlmb$coefficients

  if (is.null(gamma)) {
    # Data-driven choice of gamma (Wang et al., 2013):
    # 1. standardize the initial residuals by the MAD scale estimate S;
    # 2. flag observations with |Z/S| >= 2.5 as potential outliers;
    # 3. keep the gamma values with zeta(gamma) <= 1 and pick the one that
    #    minimizes the efficiency criterion TT().
    c1 <- data.frame(rlmb[1])$coefficients # initial coefficients (same values as beta0)
    y_1 <- res
    x <- data
    Z <- y_1 - x %*% as.matrix(c1)
    S <- 1.4826 * stats::median(abs(Z - stats::median(Z))) # MAD scale estimate
    Q <- abs(Z / S)
    index <- which(Q >= 2.5)
    zeta <- function(g) {
      m <- length(index)
      n <- length(y_1)
      rho <- 1 - exp(-Z^2 / g)
      if (m == 0) {
        # no flagged outliers: rho[-integer(0)] would drop every element
        obj <- sum(rho) * 2 / n
      } else {
        obj <- 2 * m / n + sum(rho[-index]) * 2 / n
      }
      return(obj)
    }
    gamma_1 <- seq(0.1, 10, by = 0.2)
    a_gamma <- sapply(gamma_1, zeta)
    index_1 <- which(a_gamma <= 1)
    gamma_2 <- gamma_1[index_1]
    eff <- sapply(gamma_2, function(t) TT(x, y_1, t, c1))
    gamma <- gamma_2[which.min(eff)] # first minimizer in case of ties
  }

  # choose the penalty weights (the lambda_{nj} of the documentation)
  lambda <- 1
  if (is.null(weight)) {
    weight <- obs * log(obs) / (obs * abs(beta0)) # log(n) / |beta0_j|
  }

  # minimize over beta, starting from the initial MM estimate
  x <- beta0

  ############# cgdsq

  alpha <- NULL
  p <- ncol(data)
  tol <- 1e-04 # Termination tolerance (for diag. scaled residual maxRr)
  g <- grad(data, res, x, gamma) # f-gradient at x
  g <- as.vector(g)
  objf <- fnc(data, res, x, gamma)
  objl1 <- adanorm(x, weight) # l1-norm
  objl1 <- as.vector(objl1)

  k <- 0 # iteration count
  cgdk <- 0 # iteration count for cgd
  lbfgsk <- 0 # iteration count for L-BFGS
  flagl <- 0

  step <- 1 # initial stepsize for coord. gradient iterations.
  stepl <- 1
  ups <- 0.5 # initial threshold for choosing J.

  ml <- 5 # L-BFGS memory size.
  S <- matrix(0, p, ml) # S stores the last ml changes in x.
  Y <- matrix(0, p, ml) # Y stores the last ml changes in the gradient of f.
  rho <- matrix(0, 1, ml) # rho stores the last ml values of s'y.
  kl <- 0 # number of nonempty columns in S & Y.

  # Parameters for Armijo stepsize rule
  sigma <- 0.1
  beta2 <- 0.5
  gamma1 <- 0 # parameter for dirderiv. in cgd iterations.
  # gamma2 <- 0 # parameter for dirderiv. in rank-1 Hessian accel.
  stop <- 0 # flag for termination
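
  # Main loop: alternate between coordinate gradient descent (CGD) steps and
  # L-BFGS acceleration steps (Tseng & Yun, 2009); flagl == 1 selects CGD.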

  while ((stop == 0) & (k < 1000)) {
    # run cgd for first 10 iterations
    if (k < 10) {
      flagl <- 1
    }

    h <- hessian(data, res, x, gamma) # diagonal of the Hessian of f at x
    h <- pmin(pmax(as.vector(h), 0.01), 1e+09) # clamp for numerical stability

    if (flagl == 1) {
      r_dirq <- dirq(lambda, x, g, h, ups, weight) # compute cgd direction
      maxRr <- r_dirq$maxRr
      d <- r_dirq$d
      d <- as.vector(d)
      x <- as.vector(x)

      # Check for termination
      if (maxRr <= tol) {
        stop <- 1
        break
      }

      # Armijo stepsize rule ################################################
      step <- min(step / beta2, 1)
      newobjf <- fnc(data, res, x + step * d, gamma)
      dirderiv <- t(g) %*% d + gamma1 * t(h) %*% (d^2) + lambda *
        (adanorm(x + d, weight) - objl1)
      nf <- 1
      while ((newobjf - objf) / step + lambda * (adanorm(x + step *
        d, weight) - adanorm(x, weight)) / step > sigma * dirderiv) {
        step <- step * beta2
        newobjf <- fnc(data, res, x + step * d, gamma)
        nf <- nf + 1
        if (step < 1e-30) {
          # print("CGD stepsize small! Check roundoff error in newobj-obj")
          stop <- 1
          break
        }
      }

      ######################################################################
      s <- step * d # save change in x for L-BFGS.
      x <- x + s
      objf <- newobjf
      objl1 <- adanorm(x, weight)

      # Update the threshold for choosing J, based on the current stepsize.
      if (step > 0.001) {
        ups <- max(1e-04, ups / 10)
      } else {
        if (step < 1e-06) {
          ups <- min(0.9, 50 * ups)
          step <- 0.001
        }
      }
      if (((k %% 100) < 50) & (k >= 10)) {
        flagl <- 0 # run BFGS acceleration
      } else {
        flagl <- 1 # run CGD iteration
      }
      cgdk <- cgdk + 1
    } else {
      # L-BFGS acceleration iteration

      # compute sigx using the Hessian diagonals
      r_signx <- signx(lambda, x, g, h, weight)
      sigx <- r_signx$s
      t <- r_signx$t
      maxRr <- r_signx$maxRr

      if (maxRr <= tol) {
        stop <- 1
        break
      }

      asigx <- abs(sigx)
      # q = asigx*g + lambda*weight*sigx is the obj. gradient w.r.t. the x_i 'far from 0'
      q <- asigx * g + lambda * weight * sigx
      if (norm_vec(q) > tol) {
        if (kl == 0) {
          d <- -q
        } else {
          ql <- q
          for (i in 1:kl) {
            alpha[i] <- t(S[, i]) %*% ql / rho[i]
            ql <- ql - alpha[i] * Y[, i]
          }
          r <- ql * max(1e-06, rho[1] / (t(Y[, 1]) %*% Y[, 1]))
          for (i in kl:1) {
            betal <- t(Y[, i]) %*% r / rho[i]
            r <- r + S[, i] * as.vector(alpha[i] - betal)
          }
          d <- -r * asigx # |x_i| > t whenever d_i != 0

          # steepest descent safeguard to ensure convergence
          if (t(q) %*% d > -1e-20 * t(q) %*% q | norm_vec(d) >
            1e+06 * norm_vec(q)) {
            d <- -q
          }
        }

        # Armijo stepsize rule
        stepl <- 1
        newobjf <- fnc(data, res, x + stepl * d, gamma)
        # nsx = sign(x+stepl*d); psx = sign(x);
        dirderiv <- t(q) %*% d
        nfl <- 1
        while ((newobjf - objf) / stepl + lambda * (adanorm(x + stepl *
          d, weight) - adanorm(x, weight)) / stepl > sigma * dirderiv) {
          stepl <- stepl * beta2
          newobjf <- fnc(data, res, x + stepl * d, gamma)

          if (stepl < 1e-30) {
            # cat("L-BFGS stepsize small! Check roundoff error in newobj-obj.\n")
            stop <- 1
            break
          }
          nfl <- nfl + 1
        }
        ####################################################################
        s <- stepl * d
        x <- x + s
        objf <- newobjf
        objl1 <- adanorm(x, weight)

        if ((k %% 100) < 50) {
          # | stepl> 0.1
          flagl <- 0 # run L-BFGS acceleration
        } else {
          flagl <- 1 # run CGD iteration
        }
        lbfgsk <- lbfgsk + 1
      } else {
        flagl <- 1 # run CGD iteration
      }
    }

    gold <- g
    g <- grad(data, res, x, gamma)
    y <- g - gold # save change in g for L-BFGS.

    # Check to save s and y for L-BFGS.
    if (t(y) %*% y > 1e-40) {
      sy <- t(s) %*% y
      gammal <- sy / (t(y) %*% y)
      if (gammal * max(h) > 1e-10) {
        kl <- min(ml, kl + 1)
        S <- cbind(s, S)
        Y <- cbind(y, Y)
        rho <- cbind(sy, rho)
        S <- S[, 1:ml]
        Y <- Y[, 1:ml]
        rho <- rho[, 1:ml]
        rho <- matrix(rho, nrow = 1)
      }
    }

    # # Acceleration step
    # # ##########################################################
    # if ((flagl == 1) & (k%%10 == 1) & (kl > 0)) {
    #   # Use a rank-1 approx. of Hessian of f in quadratic model and minimize
    #   # with respect to all coordinates: min g'd + |b'd|^2/2 + c||x+d||_1 The
    #   # quadratic Hessian bb' is only psd, so it may have no minimum.
    #
    #   # Choose b to satisfy (bb')s = y
    #   b <- Y[, 1]/sqrt(rho[1])
    #
    #   # If there exists i with b_i=0 and |g_i|>c, then subproblem has no
    #   # optimal soln
    #
    #   sgc <- which(lambda >= abs(g))
    #   absb <- abs(b)
    #   maxb <- max(absb)
    #   absb[sgc] <- maxb
    #   if (min(absb) > maxb * 1e-06) {
    #     # d = -x + db_i*e_i, where e_i is ith unit coordinate vector, db_i
    #     # solves qb_i = min (g_i - (b'x)b_i)db_i + b_i^2 db_i^2/2 + c|db_i|,
    #     # and i is chosen to minimize qb_i .
    #
    #     b[which(abs(b) <= 0)] = 1e-20  #perturb the zero entries of b to be nonzero
    #     bt <- t(b)
    #     gb <- t(g) - as.vector(b %*% x) * bt
    #     ############################### ???
    #
    #     db <- -matrixStats::rowMedians(cbind(t(t(rep(0, p))), t(gb + lambda * t(weight)/(bt^2)),
    #                             t(gb - lambda * t(weight)/(bt^2))))
    #     qb <- gb * db + (bt * db)^2/2 + lambda * t(weight) * abs(db)
    #     minqb <- min(qb)
    #     index <- which.min(qb)
    #     d <- -x
    #     d[index] <- d[index] + db[index]
    #
    #     dirderiv <- t(g) %*% d + gamma2 * (bt %*% d)^2 + lambda * (adanorm(x +
    #                                                                          d, weight) - objl1)
    #     if (dirderiv < (1e-08) * norm_vec(d)) {
    #       # if the direction is descent, increase the iteration and check the
    #       # termination
    #       k <- k + 1
    #       # h=min(max(hessian(data, res, x, gamma),1e-12),1e12); R=-stats::median([ x' ; (g'+c)./h ;
    #       # (g'-c)./h ]); absR=abs(R); maxRr=max(h.*absR);
    #       R <- -matrixStats::rowMedians(cbind(t(x), (t(g) + lambda * t(weight)), (t(g) -
    #                                                                    lambda * t(weight))))
    #       absR <- abs(R)
    #       maxRr <- max(absR)
    #
    #       if (maxRr <= tol) {
    #         stop <- 1
    #         break
    #       }
    #
    #       # Armijo rule for acceleration step #################################
    #
    #       sp <- 1
    #       newobjf <- fnc(data, res, x + sp * d, gamma)
    #       # nsx = sign(x+sp*d); psx = sign(x);
    #       nf <- 1
    #       while ((newobjf - objf)/sp + lambda * (adanorm(x + sp * d, weight) -
    #                                              adanorm(x, weight))/sp > sigma * dirderiv) {
    #         sp <- sp * beta2
    #         newobjf <- fnc(data, res, x + sp * d, gamma)
    #         # nsx = sign(x+sp*d);
    #         nf <- nf + 1
    #
    #         if (sp < 1e-30) {
    #           cat("rank-1 accel. stepsize small!  Check roundoff error in newobj-obj.\n")
    #           stop <- 1
    #           break
    #         }
    #       }
    #     }
    #   }
    # }
    k <- k + 1
  }
  x <- as.vector(x)
  names(weight) <- NULL
  
  if (isTRUE(intercept)) {
    alpha <- x[1]
    beta <- x[-1]
  } else {
    alpha <- NULL
    beta <- x
  }
  loss <- fnc(data, res, x, gamma)
  
  #return
  result <- list(
    beta = beta,
    alpha = alpha,
    gamma = gamma,
    weight = weight,
    loss = loss
  )
  class(result) <- "robustlm"
  
  return(result)
}


# Efficiency criterion for selecting gamma: the determinant of the sandwich
# estimate of the asymptotic covariance of the estimator, det(A^{-1} Sigma A^{-1}).
TT <- function(x, y, g, c1) {
  x0 <- x
  theta <- c1
  Ahat <- matrix(0, nrow = ncol(x0), ncol = ncol(x0))
  result <- matrix(0, nrow = length(y), ncol = ncol(x0))
  a <- 0
  
  for (i in 1:length(y)) {
    a[i] <- exp(-(y[i] - crossprod(x0[i, ], theta))^2 / (g)) *
      (1 - 2 * (y[i] - crossprod(x0[i, ], theta))^2 / (g)) *
      2 / g
    
    a1 <- matrix(rep(a[i], ncol(x0)^2), nrow = ncol(x0), ncol = ncol(x0))
    Ahat <- Ahat + tcrossprod(x0[i, ], x0[i, ]) * a1
    
    result[i, ] <- as.numeric(exp(-(y[i] - crossprod(x0[i, ], theta))^2 / (g)) *
                                (2 * (y[i] - crossprod(x0[i, ], theta)) / g)) * x0[i, ]
  }
  Ahat <- Ahat / length(y)
  Sigma <- stats::cov(result)
  Vhat1 <- solve(Ahat) %*% (Sigma) %*% solve(Ahat)
  det(Vhat1) # returned value: determinant of the asymptotic covariance
}

norm_vec <- function(x) sqrt(sum(x^2)) # Euclidean norm

# Exponential squared loss at beta: sum_i (1 - exp(-r_i^2 / gamma))
fnc <- function(data, res, beta, gamma) {
  rss <- res - data %*% beta
  y <- sum(1 - exp(-rss^2 / gamma))
  return(y)
}
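
# Note (illustration): the loss above is bounded by 1, so a single gross
# outlier contributes a bounded amount to the objective; e.g. with gamma = 1,
# 1 - exp(-0.5^2) is about 0.22 while 1 - exp(-100^2) is essentially 1.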

# Gradient of the exponential squared loss with respect to beta
grad <- function(data, res, beta, gamma) {
  rss <- res - data %*% beta
  y <- t(data) %*% (exp(-rss^2 / gamma) * rss) * (-2 / gamma)
  return(y)
}
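
# Illustrative sketch (not part of the package API): check grad() against a
# central finite-difference approximation of fnc(). Arguments follow the
# conventions of the helpers above; 'eps' is an assumed step size.
check_grad <- function(data, res, beta, gamma, eps = 1e-6) {
  analytic <- as.vector(grad(data, res, beta, gamma))
  numeric_g <- vapply(seq_along(beta), function(j) {
    e <- numeric(length(beta))
    e[j] <- eps
    (fnc(data, res, beta + e, gamma) - fnc(data, res, beta - e, gamma)) / (2 * eps)
  }, numeric(1))
  max(abs(analytic - numeric_g)) # should be near zero
}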

# Diagonal of the Hessian of the exponential squared loss: t(data)^2 squares
# the entries element-wise, so only the second derivatives with respect to
# each beta_j are accumulated.
hessian <- function(data, res, beta, gamma) {
  rss <- res - data %*% beta
  h1 <- -4 * t(data)^2 %*% (exp(-rss^2 / gamma) * rss^2) / gamma^2
  h2 <- 2 * t(data)^2 %*% exp(-rss^2 / gamma) / gamma
  y <- t(h1 + h2)
  return(y)
}

# Weighted l1 (adaptive LASSO) norm: sum_j weight_j * |x_j|
adanorm <- function(x, weight) {
  y <- t(weight) %*% abs(x)
  return(y)
}

# Compute the CGD direction: for each coordinate, R solves
# min_d g*d + h*d^2/2 + c*weight*|x + d| in closed form via the median trick,
# and coordinates whose improvement Q falls below ups*maxQ are zeroed out.
dirq <- function(c, x, g, h, ups, weight) {
  R <- -matrixStats::rowMedians(cbind(x, (g + c * weight) / h, (g - c * weight) / h)) # R = d_H(x)
  hR <- h * R
  Q <- -g * R - 0.5 * R * hR - c * weight * abs(x + R) + c * weight * abs(x)
  maxQ <- max(Q) # ||q_H||_infty
  maxRr <- max(abs(hR)) # ||H d_H(x)||_infty

  indx <- which(t(Q) < ups * maxQ)
  d <- t(R)
  d[indx] <- 0 # set d(i) = 0 if Q(i) < ups * maxQ
  return(list(maxRr = maxRr, d = d))
}
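
# Sketch (illustrative only): the rowMedians trick in dirq() and signx() is
# the closed-form minimizer of the one-dimensional piecewise-quadratic
# subproblem. A brute-force grid check for a single coordinate:
median_step_check <- function(x, g, h, w) {
  closed <- stats::median(c(-x, -(g + w) / h, -(g - w) / h))
  grid <- seq(-10, 10, by = 1e-4)
  obj <- g * grid + 0.5 * h * grid^2 + w * abs(x + grid)
  c(closed_form = closed, brute_force = grid[which.min(obj)])
}
# e.g. median_step_check(x = 0.3, g = -1, h = 2, w = 0.5) gives ~0.25 twice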

# Determine which coordinates are treated as 'far from 0' for the L-BFGS step:
# s_i = sign(x_i) when |x_i| exceeds the threshold t, and 0 otherwise.
signx <- function(c, x, g, h, weight) {
  pena <- c * weight
  R <- -matrixStats::rowMedians(cbind(x, (g + pena) / h, (g - pena) / h)) # R = d_H(x)
  absR <- abs(R)
  maxR <- max(absR)
  maxRr <- max(h * absR) # ||H d_H(x)||_infty (element-wise, as in dirq)
  if (maxRr == 0) {
    t <- 0
  } else {
    t <- -1e-04 / log(min(0.1, 0.01 * maxR))
  }
  s <- sign(x)
  s[t > abs(x)] <- 0
  return(list(s = s, t = t, maxRr = maxRr))
}
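
# Usage sketch (commented out; illustrative values only): with a few gross
# outliers in y, the bounded loss keeps the coefficient estimates close to
# the truth where least squares would be pulled away.
# set.seed(1)
# x_demo <- matrix(rnorm(100 * 3), 100, 3)
# y_demo <- x_demo %*% c(1, -1, 0.5) + rnorm(100)
# y_demo[1:5] <- y_demo[1:5] + 20   # contaminate five responses
# robustlm(x_demo, y_demo)$beta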
