R/blockwise_coordinate_descent.R

Defines functions lambda_max.coordinate_descent scale_manual rescale_without_NA_block mean_without_NA sd_without_NA_block change_NA_value_block scale_manual_with_sd cross_validation_function.block_descent blockwise_coordinate_descent

Documented in blockwise_coordinate_descent

lambda_max.coordinate_descent <- function(Z,
                                          y,
                                          n,
                                          p,
                                          p1,
                                          p2,
                                          ratio_matrix=NULL,
                                          noise=c("additive","missing")){
  start <- p1 + 1
  X1 <- Z[,1:p1]
  Z2 <- Z[,start:p]
  if (noise=="additive"){
    rho_tilde <- 1/n*t(Z)%*%y
    lambda <- max(abs(rho_tilde))
  }else{
    rho_tilde1 <- 1/n*t(X1)%*%y
    rho_tilde2 <- 1/n*t(Z2)%*%y/ diag(ratio_matrix)
    lambda = max(max(abs(rho_tilde1)),max(abs(rho_tilde2)))
  }
  lambda
}


### FUNCTIONS USED TO PERFORM SCALING OF THE CORRUPTED MATRIX WHEN THERE IS MISSING DATA

scale_manual <- function(j,Z){
  sd <- stats::sd(Z[,j])
  if (sd != 0){
    return(Z[,j]/sd)
  }else{
    return (Z[,j])
  }
  
}

rescale_without_NA_block <- function(j,Z,p1){
  m <- mean(Z[which(!is.na(Z[,p1 + j]), arr.ind = TRUE),p1+j])
  Z[,p1 + j] - m
}

mean_without_NA <- function(j,Z){
  m <- mean(Z[which(!is.na(Z[,j]), arr.ind = TRUE),j])
  m
}

sd_without_NA_block <- function(j,Z){
  sd <- stats::sd(Z[which(!is.na(Z[,j]), arr.ind = TRUE),j])
  sd
}

change_NA_value_block <- function(j,Z,p1){
  Z[which(is.na(Z[,p1 + j]), arr.ind = TRUE),j + p1] <- 0
  Z[,p1 + j]
}

scale_manual_with_sd <- function(j,Z,v){
  sd <- v[j]
  if (sd != 0){
    return(Z[,j]/sd)
  }else{
    return (Z[,j])
  }
}

###CROSS VALIDATION FUNCTION USED IN THE BLOCKWISE COORDINATE DESCENT

cross_validation_function.block_descent <- function(k,
                                                    Z,
                                                    y,
                                                    n,
                                                    n_one_fold,
                                                    n_without_fold,
                                                    p,
                                                    p1,
                                                    p2,
                                                    folds,
                                                    lambda,
                                                    list_PSD_lasso,
                                                    list_sigma_lasso,
                                                    list_PSD_error,
                                                    list_sigma_error,
                                                    ratio_matrix=NULL,
                                                    beta1.start,
                                                    beta2.start,
                                                    noise,
                                                    penalty=penalty){
  
  ### Calculating the error for the design matrix without the kth fold
  start = p1 + 1
  #Solving the lasso problem without the kth fold
  sigma_corrupted_train <- list_PSD_lasso[[k]]
  sigma_uncorrupted_train <- list_sigma_lasso[[k]] 
  index <- which(folds==k,arr.ind = TRUE)
  X1_cv_train <- Z[-index,1:p1]
  Z2_cv_train <- Z[-index,start:p]
  y_cv_train <- y[-index]
  out = lasso_covariance_block(n=n_without_fold,p1=p1,p2=p2,X1=X1_cv_train,Z2=Z2_cv_train,y=y_cv_train,
                               sigma1=sigma_uncorrupted_train,sigma2=sigma_corrupted_train,lambda=lambda,
                               noise=noise,ratio_matrix = ratio_matrix,beta1.start = beta1.start, beta2.start = beta2.start,
                               penalty=penalty)
  beta1.lambda <- out$coefficients.beta1
  beta2.lambda <- out$coefficients.beta2
  
  #Calculating the error on the remaining fold
  sigma_corrupted_test <- list_PSD_error[[k]]
  sigma_uncorrupted_test <- list_sigma_error[[k]]
  X1_cv_test <- Z[index,1:p1]
  Z2_cv_test <- Z[index,start:p]
  y_cv_test <- y[index]
  
  rho_1 <- 1/n_one_fold * t(X1_cv_test) %*% y_cv_test
  if (noise == "additive"){
    rho_2 <- 1/ n_one_fold * t(Z2_cv_test) %*% y_cv_test 
    sigma3 <- 1/ n_one_fold * t(Z2_cv_test) %*% X1_cv_test %*% beta1.lambda
  }else{
    Z2_tilde <- sapply(1:p2,function(j)Z2_cv_test[,j] / diag(ratio_matrix)[j])
    rho_2 <- 1/ n_one_fold * (t(Z2_tilde) %*% y_cv_test) 
    sigma3 <- 1/ n_one_fold * (t(Z2_tilde) %*% X1_cv_test %*% beta1.lambda) 
  }
  error <- t(beta1.lambda)%*%sigma_uncorrupted_test%*%beta1.lambda + t(beta2.lambda)%*%sigma_corrupted_test%*%beta2.lambda - 2*t(rho_1)%*%beta1.lambda - 2*t(rho_2)%*%beta2.lambda + 2*t(beta2.lambda)%*%sigma3
  
  error
}

#' Blockwise coordinate descent
#'
#' Implement blockwise coordinate descent algorithm. Best penalty value is evaluated through cross-validation.
#'
#' @param Z Corrupted design matrix (with additive error or missing data)
#' @param y Response vector
#' @param n Number of samples of the design matrix
#' @param p Number of features of the matrix
#' @param p1 Number of uncorrupted predictors
#' @param p2 Number of corrupted predictors
#' @param center.Z If TRUE, centers Z matrix without taking into account NAs values, and then change NAs to 0 value (in the
#' missing data setting).
#' @param scale.Z If TRUE, divides Z columns by their standard deviation
#' @param center.y If TRUE, centers y
#' @param scale.y If TRUE, divides y by its standard deviation
#' @param lambda.factor Range of the lambda interval we are going to explore
#' @param step Number of values of lambda in the interval we are going to test
#' @param K Number of folds for the cross-validation
#' @param mu Penalty parameter for the ADMM algorithm
#' @param tau Standard deviation for the additive error matrix in the additive error setting (NULL in the missing data setting)
#' @param etol Tolerance parameter for the ADMM algorithm
#' @param optTol Tolerance parameter for the convergence of the error in the pathwise coordinate descent
#' @param earlyStopping_max Number of iterations allowed when error starts increasing
#' @param noise Type of noise (additive or missing)
#' @param penalty Type of penalty used : can be lasso penalty or SCAD penalty
#' @param mode ADMM or HM
#' 
#' @return list containing \itemize{
#' \item \code{lambda.opt} optimal value of lambda corresponding to minimum error
#' \item \code{lambda.sd} Value of lambda corresponding to error higher than minimum error by one standard deviation
#' \item \code{beta.opt} Value of beta corresponding to \code{lambda.opt}
#' \item \code{beta.sd} Value of beta corresponding to \code{lambda.sd}
#' \item \code{data_error} Dataframe containing errors and their standard deviation for each iteration of the algorithm
#' \item \code{data_beta} Dataframe containing the values of beta for each iteration of the algorithm
#' \item \code{earlyStopping} Integer containing the value of iteration when early stopping happens
#' \item \code{vnames} Column names of Z matrix
#' \item \code{mean.Z} Mean of Z matrix without the NAs values
#' \item \code{sd.Z} Standard deviation of Z matrix without the NAs values
#' \item \code{mean.y} Mean of y matrix
#' \item \code{sd.y} Standard deviation of y matrix
#' }
#' 
#' @details It is highly recommended to use center.Z = TRUE for the algorithm to work in the case of missing data. Otherwise 
#' the algorithm will not work the way it is intended.
#' It is recommended to use center.Z = TRUE, scale.Z = TRUE, center.y = TRUE and scale.y = TRUE for both convergence
#' and interpretability reasons. The use of center.Z = TRUE in the additive error setting can be subject to discussion,
#' as it may introduce bias in the algorithm.
#' For computing speed reasons, if model is not converging or running slow, consider changing \code{mu}, decreasing
#' \code{etol} or \code{optTol} or decreasing \code{earlyStopping_max}
#' 
#' @seealso \url{https://arxiv.org/pdf/1510.07123.pdf}
#' 
#' @export
#' 

blockwise_coordinate_descent <- function(Z,
                                         y,
                                         n,
                                         p,
                                         p1,
                                         p2,
                                         center.Z = TRUE,
                                         scale.Z = TRUE,
                                         center.y = TRUE,
                                         scale.y = TRUE,
                                         lambda.factor=ifelse(dim(Z)[1] < dim(Z)[2],0.01,0.001),
                                         step=100,
                                         K=4,
                                         mu=10,
                                         tau = NULL,
                                         etol= 1e-4,
                                         optTol = 1e-5,
                                         earlyStopping_max = 10,
                                         noise=c("additive","missing"),
                                         penalty=c("lasso","SCAD"),
                                         mode="ADMM"){
  
  nrows = nrow(Z)
  ncols = ncol(Z)
  vnames = colnames(Z)
  
  if(!(is.matrix(Z))){
    stop("Z has to be a matrix")
  }
  if(!(is.matrix(y))){
    stop("y has to be a matrix")
  }
  if(!is.null(tau) & !is.numeric(tau)){
    stop("tau must be numeric")
  }
  if(n != nrows){
    stop(paste("Number of rows in Z (", nrows, ") different from n(", n, ")"),sep="")
  }
  if(p != ncols){
    stop(paste("Number of columns in Z (", ncols, ") different from p (", p, ")"),sep="")
  }
  if (nrows != dim(y)[1]){
    stop(paste("Number of rows in Z (", nrows, ") different from number of rows in y (", dim(y)[1], ") "),sep="")
  }
  if (!is.numeric(y)) {
    stop("The response y must be numeric. Factors must be converted to numeric")
  }
  if(lambda.factor >= 1){
    stop("lambda factor should be smaller than 1")
  }
  if(p1 + p2 != p){
    stop(paste("Sum of p1 and p2 (", p1 + p2, ") should be equal to p (", p, ")"),sep="")
  }
  if(mu>500 || mu<1){
    warning(paste("Mu value (", mu, ") is not in the usual range (10-500)"))
  }
  if (n %% K != 0){
    stop("K should be a divider of n")
  }
  if(noise=="missing" && center.Z == FALSE){
    stop("When noise is equal to missing, it is required to center matrix Z. Use center.Z=TRUE.")
  }
  if(scale.Z == FALSE && noise=='missing'){
    warning("When noise is equal to missing, it is recommended to use scale.Z equal to TRUE in order 
            to obtain trustworthy results.")
  }
  if(scale.Z == TRUE && noise=='additive'){
    warning("When noise is equal to additive, it is recommended to use scale.Z equal to FALSE in order 
            to obtain trustworthy results. Otherwise, the scaling should be taken into account
            when introducing the error parameter as a function parameter.")
  }
  
  #General variables we are going to use in the function
  start = 1 + p1
  n_without_fold = n - floor(n/K)
  n_one_fold = floor(n/K)
  earlyStopping = step
  
  ratio_matrix = NULL
  if (noise=="missing"){
    ratio_matrix = matrix(0,p2,p2)
    
    for (i in 1:p2){
      for (j in i:p2){
        n_ij = length(intersect(which(!is.na(Z[,p1 + i])),which(!is.na(Z[,p1 + j]))))
        ratio_matrix[i,j] = n_ij
        ratio_matrix[j,i] = n_ij
      }
    }
    ratio_matrix = ratio_matrix/n
  }
  
  mean.Z = sapply(1:p, function(j)mean_without_NA(j,Z))
  sd.Z <- sapply(1:p, function(j)sd_without_NA_block(j,Z))
  

  
  if (center.Z == TRUE){
    if (scale.Z == TRUE){
      Z[,start:p] = sapply(1:p2, function(j)rescale_without_NA_block(j,Z,p1))
      Z[,start:p] = sapply(1:p2, function(j)change_NA_value_block(j,Z,p1))
      Z[,1:p1] = scale(Z[,1:p1], center = TRUE, scale = FALSE)
      Z = sapply(1:p, function(j)scale_manual_with_sd(j,Z,sd.Z))
    }else{
      Z[,start:p] = sapply(1:p2, function(j)rescale_without_NA_block(j,Z,p1))
      Z[,start:p] = sapply(1:p2, function(j)change_NA_value_block(j,Z,p1))
      Z[,1:p1] = scale(Z[,1:p1], center = TRUE, scale = FALSE)
    }
  }else{
    if (scale.Z == TRUE){
      Z = sapply(1:p, function(j)scale_manual_with_sd(j,Z,sd.Z))
    }
  }
  
  mean.y = mean(y)
  sd.y = stats::sd(y)
  
  if (center.y == TRUE){
    if (scale.y == TRUE){
      y = scale(y, center = TRUE, scale = TRUE)
    }else{
      y = scale(y, center = TRUE, scale = FALSE)
    }
  }
  
  lambda_max <- lambda_max.coordinate_descent(Z=Z,y=y,n=n,p=p,p1=p1,p2=p2,ratio_matrix=ratio_matrix,noise=noise)
  lambda_min <- lambda.factor*lambda_max
  lambda_list <- emdbook::lseq(lambda_max,lambda_min,step)
  beta1.start <- rep(0,p1)
  beta2.start <- rep(0,p2)
  beta.start <- c(beta1.start,beta2.start)
  best.lambda <- lambda_max
  beta.opt <- beta.start
  best.error <- 10000
  error_list <- matrix(0,step,4)
  error <- 0
  earlyStopping_high = 0
  
  matrix_beta <- matrix(0,step,p)
  
  ### Creating the K matrices we are going to use for cross validation
  output = cv_covariance_matrices_block_descent(K=K, mat=Z, y=y, p=p, p1=p1, p2=p2, mu=mu, 
                                                tau=tau, ratio_matrix = ratio_matrix, etol=etol, noise = noise, mode=mode)
  list_PSD_lasso = output$list_PSD_lasso
  list_PSD_error = output$list_PSD_error
  list_sigma_lasso = output$list_sigma_lasso
  list_sigma_error = output$list_sigma_error
  sigma1 = output$sigma_global_uncorrupted
  sigma2 = output$sigma_global_corrupted
  folds = output$folds
  X1 = Z[,1:p1]
  Z2 = Z[,start:p]
  
  for (i in 1:step){
    
    lambda_step <- lambda_list[i]
    error_old <- error
    
    out = sapply(1:K, function(k)cross_validation_function.block_descent(k,
                                                                         Z,
                                                                         y,
                                                                         n,
                                                                         n_one_fold,
                                                                         n_without_fold,
                                                                         p,
                                                                         p1,
                                                                         p2,
                                                                         folds,
                                                                         lambda_step,
                                                                         list_PSD_lasso,
                                                                         list_sigma_lasso,
                                                                         list_PSD_error,
                                                                         list_sigma_error,
                                                                         ratio_matrix = ratio_matrix,
                                                                         beta1.start,
                                                                         beta2.start,
                                                                         noise=noise,
                                                                         penalty=penalty))
    error = mean(out)
    sd_low = stats::quantile(out, probs = c(0.1))
    sd_high = stats::quantile(out, probs = c(0.9))
    error_list[i,1] <- error
    error_list[i,2] <- sd_low
    error_list[i,3] <- sd_high
    error_list[i,4] <- stats::sd(out)
    out = lasso_covariance_block(n=n, p1=p1, p2=p2, X1=X1, Z2=Z2, y=y, sigma1=sigma1, sigma2=sigma2, lambda=lambda_step, 
                                 noise=noise, ratio_matrix = ratio_matrix, beta1.start = beta1.start, beta2.start = beta2.start,
                                 penalty=penalty)
    beta1 <- out$coefficients.beta1
    beta2 <- out$coefficients.beta2
    beta <- c(beta1,beta2)
    
    beta1.start <- beta1
    beta2.start <- beta2
    matrix_beta[i,] <- beta
    
    ### Checking for optimal parameters
    if (error <= best.error){
      best.error <- error
      best.lambda <- lambda_step
      beta.opt <- beta
    }
    
    ## Early stopping
    if (abs(error - error_old) < optTol){
      print("Early Stopping because of convergence of the error")
      earlyStopping = i
      break
    }
    
    # Trying to avoid getting to too high lambda values, leading to too time consuming steps
    if (i>= step/2 && error >= error_list[1]){
      print("Value of lambda yielding too high error : we exit the loop")
      earlyStopping = i
      break
    }
    if (error > best.error){
      earlyStopping_high = earlyStopping_high +1
      if (earlyStopping_high >= earlyStopping_max){
        print("Early stopping because of error getting too high")
        earlyStopping = i
        break
      }
    }
  }
  df <- data.frame(lambda=lambda_list[1:earlyStopping], error=error_list[1:earlyStopping,1],error.inf=error_list[1:earlyStopping,2],error.sup=error_list[1:earlyStopping,3],error.sd=error_list[1:earlyStopping,4])
  step.min <- which(df[,"error"] == best.error)
  sd.best <- df[step.min,"error.sd"]
  step.sd <- max(which(df[,"error"] > best.error + sd.best & df[,"lambda"] > df[step.min,"lambda"]))
  lambda.sd <- df[step.sd,"lambda"]
  beta.sd <- matrix_beta[step.sd,]
  
  
  data_intermediate <- data.frame(matrix_beta[1:earlyStopping,])
  names(data_intermediate) <- sapply(1:p, function(i)paste0("beta",i))
  
  data_beta <- data.frame(lambda = lambda_list[1:earlyStopping])
  
  data_beta <- cbind(data_beta,data_intermediate)
  
  fit = list(
    lambda.opt = best.lambda,
    lambda.sd = lambda.sd,
    beta.opt = beta.opt,
    beta.sd = beta.sd,
    data_error = df,
    data_beta = data_beta,
    earlyStopping = earlyStopping,
    vnames = vnames,
    mean.Z = mean.Z,
    sd.Z = sd.Z,
    mean.y = mean.y,
    sd.y = sd.y                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
  )
  return(fit)
}
celiaescribe/BDcocolasso documentation built on Feb. 11, 2020, 11:41 p.m.