R/cv_screen.R
In SplitKnockoff: Split Knockoffs for Structural Sparsity

Documented in cv_screen

#' calculate the CV optimal beta and estimated support set
#' @description
#' cv_all calculate the CV optimal beta and estimated
#' support set in the problem 1/n |y - X beta|^2 + 1/nu |D beta - gamma|^2 + lambda |gamma|_1.
#'
#' @param X the design matrix
#' @param y the response vector
#' @param D the linear transform
#' @param option options for screening
#'
#' @return beta_hat: CV optimal beta
#' @return stat_cv: various intermedia statistics, including the estimated support sets
#' @export
#'
#' @importFrom glmnet glmnet
cv_screen <- function(X, y, D, option){
  # cv_screen calculate the CV optimal beta
  # support set in the problem
  # 1/n |y - X beta|^2 + 1/nu |D beta - gamma|^2 + lambda |gamma|_1.
  #
  # input arguments
  # X : the design matrix
  # y : the response vector
  # D : the linear transform
  #
  # output arguments
  # beta_hat: CV optimal beta
  # stat_cv: various intermedia statistics, including the estimated support sets

  k_fold = 10
  n = nrow(X)

  # appoint the set of \nu
  if (!is.null(option$nu_cv)){
    nu_s = option$nu_cv
  }else{
    nu_s = 10.^seq(2, 0, by = -0.4)
  }

  # appoint a set of lambda
  if (!is.null(option$lambda_cv)){
    lambda_s = option$lambda_cv
  }else{
    lambda_s = 10.^seq(0, -8, by = -0.4)
  }
  nlambda = length(lambda_s)

  # screen for a estimated support set of beta
  lambda_vec = 10.^seq(0, -6, by = -0.1)
  cvfit = glmnet::cv.glmnet(X, y, lambda = lambda_vec)
  index = which(cvfit$lambda == cvfit$lambda.1se)[1]
  beta_hat = cvfit$glmnet.fit$beta[, index]
  stat_cv = list()
  stat_cv$beta_supp = which(abs(beta_hat) >= 10^(-2))

  X = X[, stat_cv$beta_supp]
  D = D[, stat_cv$beta_supp]

  m = nrow(D)
  p = ncol(D)

  test_size = floor(n / k_fold)

  # randomly split data

  set.seed(1)
  rand_rank = sample(n)

  # create matrix to store result for split lasso test loss
  loss_sl = array(0, dim = c(k_fold, length(nu_s), nlambda))

  for (k in 1:k_fold){
    # generate test set
    test_index = test_size * (k-1) + (1: test_size)
    test = rand_rank[test_index]

    # training set
    X_train = X[!(1:n %in% test), ]
    y_train = y[!(1:n %in% test)]

    # test set
    X_test = X[test, ]
    y_test = y[test]

    # normalization
    n_train = nrow(X_train)
    n_test = nrow(X_test)
    X_train = X_train - matrix(rep(colMeans(X_train), n_train), nrow = n_train, ncol = p, byrow = TRUE)
    y_train = y_train - mean(y_train)
    X_test = X_test - matrix(rep(colMeans(X_test), n_test), nrow = n_test, ncol = p, byrow = TRUE)
    y_test = y_test - mean(y_test)

    # test loss for split lasso
    for (i in 1:length(nu_s)){
      nu = nu_s[i]

      # generate split LASSO design matrix
      A_beta <- rbind(X_train/sqrt(n_train),D/sqrt(nu))
      A_gamma <- rbind(matrix(0,n_train,m),-diag(m)/sqrt(nu))
      tilde_y <- rbind(matrix(y_train, ncol = 1)/sqrt(n_train),matrix(0,m,1))

      # set penalty
      penalty <- matrix(1,m+p,1)

      for (tmp in 1: p) {
        penalty[tmp, 1] = 0
      }

      # lasso path settings for glmnet

      fit = glmnet(cbind(A_beta,A_gamma) , tilde_y, lambda = lambda_s, standardize = FALSE, intercept = FALSE, penalty.factor = penalty)
      coefs <- fit$beta
      for (j in 1:length(lambda_s)){
        coef = coefs[ ,j]
        nonzeros = sum(coef != 0)
        beta = coef[1:p]
        # calculate loss
        y_sl = as.matrix(X_test) %*% as.matrix(beta)
        loss_sl[k, i, j] = norm(y_sl - y_test, type = "F")^2/test_size
        if (nonzeros > option$n_2){
          loss_sl[k, i, j] = Inf
        }
      }
    }
  }

  mean_loss_sl = colMeans(loss_sl)
  sd_loss_sl = apply(loss_sl, c(2, 3), stats::sd)

  # find minimal
  min_ind = which(mean_loss_sl <= min(mean_loss_sl + sd_loss_sl, na.rm = TRUE), arr.ind = TRUE)
  ind_r = nrow(min_ind)
  nu_number = min_ind[ind_r, 1]
  lambda_number = min_ind[ind_r, 2]
  nu_sl = nu_s[nu_number]
  lambda_sl = lambda_s[lambda_number]

  stat_cv$nu = nu_sl
  stat_cv$lambda = lambda_sl

  # calculate beta
  A_beta <- rbind(X/sqrt(n),D/sqrt(nu_sl))
  A_gamma <- rbind(matrix(0,n,m),-diag(m)/sqrt(nu_sl))
  tilde_y <- rbind(matrix(y, ncol = 1)/sqrt(n),matrix(0,m,1))

  fit = glmnet(cbind(A_beta,A_gamma) , tilde_y, lambda = lambda_sl, standardize = FALSE, intercept = FALSE, penalty.factor = penalty)
  coef <- fit$beta
  beta_hat = coef[1:p]
  gamma = coef[(p+1):(p+m)]
  stat_cv$gamma_supp = which(abs(gamma) >= 10^(-2))
  if (length(stat_cv$beta_supp) + length(stat_cv$gamma_supp) > option$n_2){
    gamma_sort = sort(abs(gamma), decreasing = TRUE)
    threshold = gamma_sort[option$n_2 - length(stat_cv$beta_supp) + 1]
    stat_cv$gamma_supp = which(abs(gamma) > threshold)
  }

  return(list(beta_hat = beta_hat, stat_cv = stat_cv))
}