R/functions.R

#' Fit a linear model with the elastic net penalty using a coordinate descent algorithm.
#'
#' @description Fit a linear model via penalized maximum likelihood. The regularization path is computed for the elastic net penalty at a grid of values of the regularization parameter lambda, using a coordinate descent algorithm.
#' @param X input matrix, of dimension nobs x nvars; each row is an observation vector.
#' @param y response variable.
#' @param alpha The elastic net mixing parameter, with \eqn{0 \le \alpha \le 1}. The penalty is defined as \deqn{(1-\alpha)/2 ||\beta||_2^2 + \alpha ||\beta||_1,} where \eqn{\alpha=1} gives the lasso penalty and \eqn{\alpha=0} the ridge penalty.
#' @param maxit Maximum number of coordinate-descent passes for each \code{lambda} value; default is 10^4.
#' @param eps Convergence threshold for coordinate descent. Each inner coordinate-descent loop continues until the maximum absolute change in the coefficients after a full pass over the variables is less than \code{eps}. Default value is 1e-5.
#' @param lambda.length The length of the \code{lambda} sequence. Default is 100. The largest \code{lambda} is the smallest value that shrinks all coefficients to zero; the smallest \code{lambda} is the largest \code{lambda} times 0.001. The \code{lambda} sequence is constructed from the largest to the smallest value, with length \code{lambda.length}.
#' @details The sequence of models implied by \code{lambda} is fit by coordinate descent. The objective function is \deqn{RSS/(2 \cdot nobs) + \lambda \cdot penalty.}
#' @return \code{beta}     a matrix of fitted coefficients (one column per value of \code{lambda}), corresponding to the decreasing \code{lambda} sequence.
#'
#' \code{lambda}      the actual sequence of \code{lambda} values used. When alpha=0, the largest lambda reported does not quite give the zero coefficients reported (lambda=inf would in principle). Instead, the largest lambda for alpha=0.01 is used, and the sequence of lambda values is derived from this.
#' @export
#' @examples
#' # Gaussian
#' x=matrix(rnorm(100*20),100,20)
#' y=rnorm(100)
#' fit1=elnet_coord(x,y)
#' plotpath(fit1,x)
elnet_coord = function(X, y, alpha=1, lambda.length=100, maxit = 10000, eps = 1e-5){
  # standardization
  n = length(y)
  p = dim(X)[2]
  y_mean = mean(y)
  X_mean = apply(X, 2, mean)
  X_sd = apply(X, 2, stats::sd)
  y = scale(y, scale = FALSE)
  X = scale(X)

  # lambda
  # lambda_max is (up to the 1.1 safety factor) the smallest lambda that
  # shrinks all coefficients to zero, max_j |<x_j, y>| / (n * alpha);
  # for alpha = 0 the ridge path has no finite lambda_max, so 0.01 is substituted
  if (alpha > 0) lambda_max = max(abs(apply(X, 2, function(x){sum(x*y)}))) / (n * alpha) * 1.1
  else lambda_max = max(abs(apply(X, 2, function(x){sum(x*y)}))) / (n * 0.01) * 1.1
  lambda_min = lambda_max * 0.001
  lambda_seq = seq(from = lambda_max, to = lambda_min, length.out = lambda.length)

  beta_list = list()
  max_iter_logic = FALSE

  for (m in 1:lambda.length){
    if(max_iter_logic) break
    lambda = lambda_seq[m]
    if (m==1) beta = rep(0,p) else beta = beta_list[[m-1]]   # warm start from the previous lambda
    k = 1

    repeat{
      # a copy to have new update
      beta1 = beta

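      # cyclic coordinate update: soft-threshold the partial-residual inner
      # product for coordinate j, then shrink by the ridge factor 1 + lambda*(1-alpha)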
      for (j in 1:p){
        const = mean(y - X %*% beta1)
        r = y - X %*% beta1 - const
        term1 = sum(r*X[,j])/n + beta1[j]
        beta1[j] = S_opera(term1, lambda * alpha)/(1 + lambda * (1-alpha) )
      }

      k = k+1
      if (max(abs(beta1 - beta)) < eps) break
      if (k > maxit){
        warning(paste('maximum number of iterations reached after', m, 'lambda values'))
        max_iter_logic = TRUE
        break
      }

      # update the parameter
      beta = beta1
    }

    # store the estimation for m-th lambda value
    beta_list[[m]] = beta1
  }

  # transform coefficients back to the original scale of X
  beta_list_org = sapply(beta_list, function(x){x / X_sd})

  return(list(beta = beta_list_org, lambda = lambda_seq[seq_along(beta_list)]))
}
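
# A quick sanity check (a sketch, left commented out so it does not run at
# package load): at the smallest lambda the penalty is nearly inactive, so the
# last column of `beta` should be close to the ordinary least squares
# coefficients on the same data.
# set.seed(1)
# x = matrix(rnorm(200*5), 200, 5)
# y = x %*% c(2, -1, 0, 0, 1) + rnorm(200)
# fit = elnet_coord(x, y, alpha = 1)
# cbind(elnet = fit$beta[, ncol(fit$beta)], ols = stats::lm(y ~ x)$coefficients[-1])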

#' Leveraging methods for linear regression.
#' @description Implements algorithmic leveraging for linear regression using uniform and leverage-score-based subsampling of rows.
#' @param x one-dimensional observations.
#' @param y response variable.
#' @param r the size of random sample.
#' @param draws number of draws of subsamples.
#' @param seed seed number for reproducible experiments.
#' @details This algorithm approximates the linear regression coefficients in a dataset of sample size n using only a randomly selected subset of size r << n. Selecting the r samples uniformly at random often produces a biased estimate, while selecting samples with probability proportional to their leverage scores largely alleviates this problem.
#' @return \code{beta_unif} estimated coefficients matrix of all draws using uniform subsampling.
#'
#' \code{beta_blev} estimated coefficients matrix of all draws using leverage-score-based subsampling.
#' @export
#' @examples
#' x = rt(500, df=6)
#' y = -x + rnorm(500)
#' est = algo_average(x,y,r=50)
algo_average = function(x, y, r, draws=500, seed=101){
  beta_unif = array(0, dim = c(draws, 2))
  colnames(beta_unif) = c('intercept', 'slope')
  beta_blev = array(0, dim = c(draws, 2))
  colnames(beta_blev) = c('intercept', 'slope')

  l = length(x)
  # leverage scores of the one-column (no-intercept) design, hii = x_i^2 / sum(x^2),
  # normalized into a sampling probability vector
  hii = diag(matrix(x, nrow = l) %*% matrix(x, nrow = 1)/sum(x^2))
  hii = hii/sum(hii)

  set.seed(seed)
  for (j in 1:draws){
    sample_unif = sample(l, r)
    beta_unif[j,] = stats::lm(y[sample_unif] ~ x[sample_unif])$coefficients

    sample_blev = sample(l, r, prob = hii)
    # weight each sampled point by the inverse of its sampling probability
    W = 1/hii[sample_blev]
    W = W/sum(W)
    beta_blev[j,] = stats::lm(y[sample_blev] ~ x[sample_blev], weights = W)$coefficients
  }

  return(list(beta_unif = beta_unif, beta_blev = beta_blev))
}
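
# A sketch comparing the two schemes on the roxygen example above (commented
# out so it does not run at package load): for heavy-tailed x, leverage-based
# subsampling should give slope estimates with visibly smaller spread around
# the truth (-1) than uniform subsampling.
# set.seed(1)
# x = rt(500, df = 6)
# y = -x + rnorm(500)
# est = algo_average(x, y, r = 50)
# apply(est$beta_unif, 2, stats::var)
# apply(est$beta_blev, 2, stats::var)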

#' Iteratively solve a linear system of equations.
#' @description Solve a linear system of equations using the Gauss-Seidel or Jacobi method.
#' @param A  a numeric matrix containing the coefficients of the linear system.
#' @param b  a numeric vector giving the right-hand side of the linear system.
#' @param x0 the initial solution vector.
#' @param method a character string indicating which iterative method to use. Either 'Gauss-Seidel' or 'Jacobi'.
#' @param parallelize Logical; only used when \code{method == 'Jacobi'}. Should the Jacobi method be run in parallel (TRUE) or not (FALSE, the default)?
#' @param cores The number of cores to use for parallel execution. Default is 1.
#' @param maxit Maximum number of iterations; default is 15000.
#' @param eps Convergence threshold. Default value is 1e-5.
#' @return \code{x.last}  solution vector found in the last iteration.
#'
#' \code{x.all} a list of solutions of all iterations.
#'
#' \code{time} a vector of cumulative elapsed times, one entry per iteration.
#' @import foreach
#' @export
#' @examples
#' A = matrix(0, nrow = 100, ncol = 100)
#' diag(A) = rep(2, 100)
#' for (i in 1:100){
#'  for (j in 1:100){
#'   if(abs(i-j)==1) A[i,j] = -1
#'   }
#' }
#' v = rep(c(1,0), 50)
#' est = solve_ols(A = A, b = A%*%v, method = 'Gauss-Seidel')
#' print(max(abs(est$x.last - v)))
solve_ols = function(A, b, x0=rep(0,length(b)), method=c('Gauss-Seidel', 'Jacobi'),
                     maxit=15000, eps=1e-5, parallelize = FALSE, cores = 1){
  method = match.arg(method)   # errors unless method is 'Gauss-Seidel' or 'Jacobi'
  if (any(diag(A) == 0)) stop("0 can't appear in diagonal entries of A")
  p = dim(A)[1]
  a = proc.time()[3]

  if (method == 'Gauss-Seidel'){
    # splitting A = D - L - U; the Gauss-Seidel iteration matrix is (D - L)^{-1} U
    D = diag(diag(A))
    L = -A
    L[upper.tri(A, diag = TRUE)] = 0
    U = -A
    U[lower.tri(A, diag = TRUE)] = 0
    T1 = solve(D - L) %*% U
    snorm = max(svd(T1)$d)
    print(paste("the relevant spectral norm is", snorm))
    if (snorm >= 1){
      print("the relevant spectral norm is at least 1; the algorithm may not converge")
      return(NULL)
    }
    c  = solve(D-L) %*% b
    k = 1
    x.list = list()
    x.list[[1]] = T1 %*% x0 + c
    elapse.time = proc.time()[3] - a

    repeat{
      k = k+1
      x.list[[k]] = T1 %*% x.list[[k-1]] + c
      elapse.time = c(elapse.time, proc.time()[3]-a)
      if ( max(abs( x.list[[k]] - x.list[[k-1]] )) < eps) break
      if (k > maxit){
        warning("max iterations reached")
        break
      }
    }
  }

  if (method == 'Jacobi'){
    # splitting A = D - L - U; the Jacobi iteration matrix is D^{-1} (L + U)
    Dinv = diag(1/diag(A))
    L = -A
    L[upper.tri(A, diag = TRUE)] = 0
    U = -A
    U[lower.tri(A, diag = TRUE)] = 0
    T1 = Dinv %*% (L+U)
    snorm = max(svd(T1)$d)
    print(paste("the relevant spectral norm is", snorm))
    if (snorm >= 1){
      print("the relevant spectral norm is at least 1; the algorithm may not converge")
      return(NULL)
    }

    if (parallelize){
      doParallel::registerDoParallel(cores)
      k = 1
      x.list = list()
      elapse.time = NULL
      repeat{
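        # each Jacobi update depends only on the previous iterate x0, so the p
        # coordinate updates are independent and can be computed in parallel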
        xnew <- foreach::foreach(i = 1:p, .combine='c') %dopar% {
          (b[i] - sum(A[i,-i] * x0[-i]))/A[i,i]
        }
        elapse.time = c(elapse.time, proc.time()[3] - a)
        x.list[[k]] = xnew
        if ( max(abs( xnew - x0 )) < eps) break
        if (k > maxit){
          warning("max iterations reached")
          break
        }
        k = k+1
        x0 = xnew
      }
    }else{
      c  = Dinv %*% b
      k = 1
      x.list = list()
      x.list[[1]] = T1 %*% x0 + c
      elapse.time = proc.time()[3] - a

      repeat{
        k = k+1
        x.list[[k]] = T1 %*% x.list[[k-1]] + c
        elapse.time = c(elapse.time, proc.time()[3]-a)
        if ( max(abs( x.list[[k]] - x.list[[k-1]] )) < eps) break
        if (k > maxit){
          warning("max iterations reached")
          break
        }
      }
    }

  }

  return(list(x.last=x.list[[length(x.list)]], x.all=x.list, time = elapse.time))
}
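
# A sketch (commented out; assumes A and v as constructed in the roxygen
# example above): Gauss-Seidel typically needs fewer iterations than Jacobi
# on this tridiagonal system, which can be checked by comparing the number of
# stored iterates.
# est_gs = solve_ols(A = A, b = A %*% v, method = 'Gauss-Seidel')
# est_jc = solve_ols(A = A, b = A %*% v, method = 'Jacobi')
# c(gauss_seidel = length(est_gs$x.all), jacobi = length(est_jc$x.all))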



# soft-thresholding operator S(z, r) = sign(z) * max(|z| - r, 0),
# the proximal map of the l1 part of the elastic net penalty
S_opera = function(z,r){
  sign(z) * max(c(abs(z)-r, 0))
}
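
# A few worked values: S_opera(3, 1) returns 2, S_opera(-3, 1) returns -2,
# and S_opera(0.5, 1) returns 0; this last case is how the lasso part of the
# penalty produces exact zeros in the coordinate updates.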

#' Plot the solution path of fitted coefficients
#'
#' @param fit fitted object returned by \code{elnet_coord}
#' @param x original input matrix
#'
#' @export
#'
plotpath <- function(fit, x){
  p = dim(x)[2]
  ylim_beta = range(fit$beta)
  # one curve per coefficient, plotted against log(lambda)
  graphics::plot(y = fit$beta[1,], x = log(fit$lambda), ylim = ylim_beta + c(-0.1,0.1), col = 1, type = 'l', xlab = "Log Lambda", ylab = "Coefficients", main = "Solution path")
  for (i in 2:p){
    graphics::lines(y = fit$beta[i,], x = log(fit$lambda), col = i)
  }
}