R/blockwise_coordinate_descent_general.R


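### Computes lambda_max, the starting value of the penalty path: the largest absolute
### empirical correlation (1/n)|Z_j' y|, with the missing-data block rescaled by the
### diagonal of the observation ratio matrix.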
lambda_max.coordinate_descent.general <- function(Z,
                                          y,
                                          n,
                                          p,
                                          p1,
                                          p2,
                                          p3,
                                          ratio_matrix=NULL){
  start <- p1 + p2 + 1
  X1 <- Z[,1:(p1+p2)]
  Z2 <- Z[,start:p]
  rho_tilde1 <- 1/n * t(X1) %*% y
  rho_tilde2 <- 1/n * t(Z2) %*% y / diag(ratio_matrix)
  lambda <- max(max(abs(rho_tilde1)), max(abs(rho_tilde2)))
  lambda
}


### FUNCTIONS USED TO PERFORM SCALING OF THE CORRUPTED MATRIX WHEN THERE IS MISSING DATA

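# Divides column j of Z by its standard deviation (column left unchanged if the sd is zero).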
scale_manual <- function(j,Z){
  sd <- stats::sd(Z[,j])
  if (sd != 0){
    return(Z[,j]/sd)
  }else{
    return (Z[,j])
  }
  
}

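# Centers column p1+j of Z by the mean of its observed (non-NA) entries.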
rescale_without_NA_block <- function(j,Z,p1){
  m <- mean(Z[which(!is.na(Z[,p1 + j]), arr.ind = TRUE),p1+j])
  Z[,p1 + j] - m
}

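# Mean of column j of Z, ignoring NA values.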
mean_without_NA <- function(j,Z){
  m <- mean(Z[which(!is.na(Z[,j]), arr.ind = TRUE),j])
  m
}

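# Standard deviation of column j of Z, ignoring NA values.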
sd_without_NA_block <- function(j,Z){
  sd <- stats::sd(Z[which(!is.na(Z[,j]), arr.ind = TRUE),j])
  sd
}

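# Sets the NA entries of column p1+j of Z to 0 (applied after centering).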
change_NA_value_block <- function(j,Z,p1){
  Z[which(is.na(Z[,p1 + j]), arr.ind = TRUE),j + p1] <- 0
  Z[,p1 + j]
}

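# Divides column j of Z by the precomputed standard deviation v[j] (column left unchanged if it is zero).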
scale_manual_with_sd <- function(j,Z,v){
  sd <- v[j]
  if (sd != 0){
    return(Z[,j]/sd)
  }else{
    return (Z[,j])
  }
}

### CROSS-VALIDATION FUNCTION USED IN THE BLOCKWISE COORDINATE DESCENT

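### For fold k: fits the blockwise lasso on the other K-1 folds using the precomputed
### covariance surrogates, then evaluates the corrected quadratic loss
### beta' Sigma_test beta - 2 rho_test' beta (plus the cross-block terms) on the held-out fold.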
cross_validation_function.block_descent_general <- function(k,
                                                    Z,
                                                    y,
                                                    n,
                                                    n_one_fold,
                                                    n_without_fold,
                                                    p,
                                                    p1,
                                                    p2,
                                                    p3,
                                                    folds,
                                                    lambda,
                                                    list_PSD_lasso_additive,
                                                    list_PSD_lasso_missing,
                                                    list_sigma_lasso,
                                                    list_PSD_error_additive,
                                                    list_PSD_error_missing,
                                                    list_sigma_error,
                                                    ratio_matrix=NULL,
                                                    beta1.start,
                                                    beta2.start,
                                                    beta3.start,
                                                    penalty=penalty){
  
  ### Fitting on the design matrix without the kth fold
  start = p1 + 1
  #Solving the lasso problem without the kth fold
  if (p1 != 0) {
    sigma_uncorrupted_train <- list_sigma_lasso[[k]]
  } else {
    # No uncorrupted block when p1 == 0: pass NULL, as in the full-data fit
    sigma_uncorrupted_train <- NULL
  }
  sigma_corrupted_train_additive <- list_PSD_lasso_additive[[k]]
  sigma_corrupted_train_missing <- list_PSD_lasso_missing[[k]]
  
  index <- which(folds==k,arr.ind = TRUE)
  X1_cv_train <- Z[-index,1:p1]
  Z2_cv_train <- Z[-index,start:(p1+p2)]
  Z3_cv_train <- Z[-index,(p1+p2+1):p]
  y_cv_train <- y[-index]
  out = lasso_covariance_block_general(n=n_without_fold,p1=p1,p2=p2,p3=p3,X1=X1_cv_train,Z2=Z2_cv_train,Z3=Z3_cv_train,y=y_cv_train,
                               sigma1=sigma_uncorrupted_train,sigma2=sigma_corrupted_train_additive,sigma3=sigma_corrupted_train_missing,lambda=lambda,
                               ratio_matrix = ratio_matrix,beta1.start = beta1.start, beta2.start = beta2.start, beta3.start = beta3.start,
                               penalty=penalty)
  if (p1 != 0) {
    beta1.lambda <- out$coefficients.beta1
  }
  beta2.lambda <- out$coefficients.beta2
  beta3.lambda <- out$coefficients.beta3
  
  #Calculating the error on the remaining fold
  if (p1 != 0) {
    sigma_uncorrupted_test <- list_sigma_error[[k]]
    sigma_corrupted_test_additive <- list_PSD_error_additive[[k]]
    sigma_corrupted_test_missing <- list_PSD_error_missing[[k]]
    X1_cv_test <- Z[index,1:p1]
    Z2_cv_test <- Z[index,start:(p1+p2)]
    Z3_cv_test <- Z[index,(p1+p2+1):p]
    y_cv_test <- y[index]
    
    rho_1 <- 1/n_one_fold * t(X1_cv_test) %*% y_cv_test
    rho_2 <- 1/n_one_fold * t(Z2_cv_test) %*% y_cv_test
    Z3_tilde <- sapply(1:p3,function(j)Z3_cv_test[,j] / diag(ratio_matrix)[j])
    rho_3 <- 1/n_one_fold * t(Z3_tilde) %*% y_cv_test
    
    sigma21 <- 1/n_one_fold * t(Z2_cv_test) %*% X1_cv_test %*% beta1.lambda
    sigma31 <- 1/n_one_fold * t(Z3_tilde) %*% X1_cv_test %*% beta1.lambda
    sigma32 <- 1/n_one_fold * t(Z3_tilde) %*% Z2_cv_test %*% beta2.lambda
    
    error <- t(beta1.lambda)%*%sigma_uncorrupted_test%*%beta1.lambda + 
      t(beta2.lambda)%*%sigma_corrupted_test_additive%*%beta2.lambda +
      t(beta3.lambda)%*%sigma_corrupted_test_missing%*%beta3.lambda - 
      2*t(rho_1)%*%beta1.lambda - 
      2*t(rho_2)%*%beta2.lambda -
      2*t(rho_3)%*%beta3.lambda + 
      2*t(beta2.lambda)%*%sigma21 +
      2*t(beta3.lambda)%*%sigma31 +
      2*t(beta3.lambda)%*%sigma32
    
    return(error)
  }
  if (p1 == 0) {
    sigma_corrupted_test_additive <- list_PSD_error_additive[[k]]
    sigma_corrupted_test_missing <- list_PSD_error_missing[[k]]
    Z2_cv_test <- Z[index,start:(p1+p2)]
    Z3_cv_test <- Z[index,(p1+p2+1):p]
    y_cv_test <- y[index]
    
    rho_2 <- 1/n_one_fold * t(Z2_cv_test) %*% y_cv_test
    Z3_tilde <- sapply(1:p3,function(j)Z3_cv_test[,j] / diag(ratio_matrix)[j])
    rho_3 <- 1/n_one_fold * t(Z3_tilde) %*% y_cv_test
    
    sigma32 <- 1/n_one_fold * t(Z3_tilde) %*% Z2_cv_test %*% beta2.lambda
    
    error <- t(beta2.lambda)%*%sigma_corrupted_test_additive%*%beta2.lambda +
      t(beta3.lambda)%*%sigma_corrupted_test_missing%*%beta3.lambda - 
      2*t(rho_2)%*%beta2.lambda -
      2*t(rho_3)%*%beta3.lambda + 
      2*t(beta3.lambda)%*%sigma32
    
    return(error)
  }
}

#' Blockwise coordinate descent
#'
#' Implements the blockwise coordinate descent algorithm. The best penalty value is selected through cross-validation.
#'
#' @param Z Corrupted design matrix (with additive error or missing data)
#' @param y Response vector
#' @param n Number of samples of the design matrix
#' @param p Number of features of the design matrix
#' @param p1 Number of uncorrupted predictors
#' @param p2 Number of corrupted predictors containing additive error
#' @param p3 Number of corrupted predictors containing missingness
#' @param center.Z If TRUE, centers the columns of Z using means computed over the observed (non-NA) values, and then sets
#' the NA values to 0 (in the missing data setting).
#' @param scale.Z If TRUE, divides Z columns by their standard deviation
#' @param center.y If TRUE, centers y
#' @param scale.y If TRUE, divides y by its standard deviation
#' @param lambda.factor Ratio between the smallest and largest lambda values explored (lambda_min = lambda.factor * lambda_max)
#' @param step Number of lambda values tested in the interval
#' @param K Number of folds for the cross-validation
#' @param mu Penalty parameter for the ADMM algorithm
#' @param tau Standard deviation for the additive error matrix in the additive error setting (NULL in the missing data setting)
#' @param etol Tolerance parameter for the ADMM algorithm
#' @param optTol Tolerance parameter for the convergence of the error in the pathwise coordinate descent
#' @param earlyStopping_max Number of iterations allowed when error starts increasing
#' @param penalty Type of penalty used: can be the lasso penalty or the SCAD penalty
#' @param mode ADMM or HM
#' 
#' @return list containing \itemize{
#' \item \code{lambda.opt} optimal value of lambda corresponding to minimum error
#' \item \code{lambda.sd} Value of lambda corresponding to error higher than minimum error by one standard deviation
#' \item \code{beta.opt} Value of beta corresponding to \code{lambda.opt}
#' \item \code{beta.sd} Value of beta corresponding to \code{lambda.sd}
#' \item \code{data_error} Dataframe containing errors and their standard deviation for each iteration of the algorithm
#' \item \code{data_beta} Dataframe containing the values of beta for each iteration of the algorithm
#' \item \code{earlyStopping} Integer containing the value of iteration when early stopping happens
#' \item \code{vnames} Column names of Z matrix
#' \item \code{mean.Z} Column means of Z, computed ignoring the NA values
#' \item \code{sd.Z} Column standard deviations of Z, computed ignoring the NA values
#' \item \code{mean.y} Mean of y
#' \item \code{sd.y} Standard deviation of y
#' }
#' 
#' @details Using center.Z = TRUE is required in the missing data setting (the function stops otherwise);
#' without it the algorithm will not work the way it is intended.
#' It is recommended to use center.Z = TRUE, scale.Z = TRUE, center.y = TRUE and scale.y = TRUE for both convergence
#' and interpretability reasons. The use of center.Z = TRUE in the additive error setting can be subject to discussion,
#' as it may introduce bias in the algorithm.
#' If the model is not converging or is running slowly, consider changing \code{mu}, decreasing
#' \code{etol} or \code{optTol}, or decreasing \code{earlyStopping_max}.
#' 
#' @seealso \url{https://arxiv.org/pdf/1510.07123.pdf}
#' 
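#' @examples
#' \dontrun{
#' ## Minimal illustrative sketch on simulated data (parameter values are arbitrary):
#' ## p1 uncorrupted columns, then p2 columns with additive error of sd tau,
#' ## then p3 columns with values missing completely at random.
#' n <- 100; p1 <- 2; p2 <- 2; p3 <- 2; p <- p1 + p2 + p3
#' X <- matrix(rnorm(n * p), n, p)
#' beta <- c(2, -2, 1.5, -1.5, 1, -1)
#' y <- matrix(X %*% beta + rnorm(n), n, 1)
#' tau <- 0.3
#' Z <- X
#' Z[, (p1 + 1):(p1 + p2)] <- Z[, (p1 + 1):(p1 + p2)] + matrix(rnorm(n * p2, sd = tau), n, p2)
#' for (j in (p1 + p2 + 1):p) Z[sample(n, floor(0.1 * n)), j] <- NA
#' fit <- blockwise_coordinate_descent_general(Z = Z, y = y, n = n, p = p,
#'                                             p1 = p1, p2 = p2, p3 = p3,
#'                                             tau = tau, K = 4, penalty = "lasso")
#' fit$lambda.opt
#' fit$beta.opt
#' }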
#' @export
#' 

blockwise_coordinate_descent_general <- function(Z,
                                         y,
                                         n,
                                         p,
                                         p1,
                                         p2,
                                         p3,
                                         center.Z = TRUE,
                                         scale.Z = TRUE,
                                         center.y = TRUE,
                                         scale.y = TRUE,
                                         lambda.factor=ifelse(dim(Z)[1] < dim(Z)[2],0.01,0.001),
                                         step=100,
                                         K=4,
                                         mu=10,
                                         tau = NULL,
                                         etol= 1e-4,
                                         optTol = 1e-5,
                                         earlyStopping_max = 10,
                                         penalty=c("lasso","SCAD"),
                                         mode="ADMM"){
  
  nrows = nrow(Z)
  ncols = ncol(Z)
  vnames = colnames(Z)
  
  if(!(is.matrix(Z))){
    stop("Z has to be a matrix")
  }
  if(!(is.matrix(y))){
    stop("y has to be a matrix")
  }
  if(!is.null(tau) && !is.numeric(tau)){
    stop("tau must be numeric")
  }
  if(n != nrows){
    stop(paste0("Number of rows in Z (", nrows, ") different from n (", n, ")"))
  }
  if(p != ncols){
    stop(paste0("Number of columns in Z (", ncols, ") different from p (", p, ")"))
  }
  if (nrows != dim(y)[1]){
    stop(paste0("Number of rows in Z (", nrows, ") different from number of rows in y (", dim(y)[1], ")"))
  }
  if (!is.numeric(y)) {
    stop("The response y must be numeric. Factors must be converted to numeric")
  }
  if(lambda.factor >= 1){
    stop("lambda factor should be smaller than 1")
  }
  if(p1 + p2 + p3 != p){
    stop(paste0("Sum of p1, p2 and p3 (", p1 + p2 + p3, ") should be equal to p (", p, ")"))
  }
  if(mu>500 || mu<1){
    warning(paste("Mu value (", mu, ") is not in the usual range (10-500)"))
  }
  if (n %% K != 0){
    stop("K should be a divider of n")
  }
  if(center.Z == FALSE){
    stop("When the data contain missingness, it is required to center matrix Z. Use center.Z=TRUE.")
  }
  if(scale.Z == FALSE){
    warning("When the data contain missingness, it is recommended to use scale.Z equal to TRUE in order 
            to obtain trustworthy results.")
  }
  
  #General variables we are going to use in the function
  start = 1 + p1 + p2
  n_without_fold = n - floor(n/K)
  n_one_fold = floor(n/K)
  earlyStopping = step
  
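  ### ratio_matrix[i,j]: fraction of samples for which missing-data columns i and j are both observed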
  ratio_matrix = matrix(0,p3,p3)
    
  for (i in 1:p3){
    for (j in i:p3){
      n_ij = length(intersect(which(!is.na(Z[,p1 + p2 + i])),which(!is.na(Z[,p1 + p2 + j]))))
      ratio_matrix[i,j] = n_ij
      ratio_matrix[j,i] = n_ij
    }
  }
  ratio_matrix = ratio_matrix/n
  
  
  mean.Z = sapply(1:p, function(j)mean_without_NA(j,Z))
  sd.Z <- sapply(1:p, function(j)sd_without_NA_block(j,Z))
  
  
  
  if (center.Z == TRUE){
    if (scale.Z == TRUE){
      Z[,start:p] = sapply(1:p3, function(j)rescale_without_NA_block(j,Z,p1+p2))
      Z[,start:p] = sapply(1:p3, function(j)change_NA_value_block(j,Z,p1+p2))
      Z[,1:(p1+p2)] = scale(Z[,1:(p1+p2)], center = TRUE, scale = FALSE)
      if (p1 == 0) {
        Z[,c((p1+p2+1):p)] = sapply(1:(p - p2), function(j)scale_manual_with_sd(j,Z[,c((p1+p2+1):p)],sd.Z[c((p1+p2+1):p)]))
      }
      if (p1 != 0) {
        Z[,c(1:p1,(p1+p2+1):p)] = sapply(1:(p - p2), function(j)scale_manual_with_sd(j,Z[,c(1:p1,(p1+p2+1):p)],sd.Z[c(1:p1,(p1+p2+1):p)]))
      }
    }else{
      Z[,start:p] = sapply(1:p3, function(j)rescale_without_NA_block(j,Z,p1+p2))
      Z[,start:p] = sapply(1:p3, function(j)change_NA_value_block(j,Z,p1+p2))
      Z[,1:(p1+p2)] = scale(Z[,1:(p1+p2)], center = TRUE, scale = FALSE)
    }
  }else{
    if (scale.Z == TRUE){
      Z = sapply(1:p, function(j)scale_manual_with_sd(j,Z,sd.Z))
    }
  }
  
  mean.y = mean(y)
  sd.y = stats::sd(y)
  
  if (center.y == TRUE){
    if (scale.y == TRUE){
      y = scale(y, center = TRUE, scale = TRUE)
    }else{
      y = scale(y, center = TRUE, scale = FALSE)
    }
  }
  
  lambda_max <- lambda_max.coordinate_descent.general(Z=Z,y=y,n=n,p=p,p1=p1,p2=p2,p3=p3,ratio_matrix=ratio_matrix)
  lambda_min <- lambda.factor*lambda_max
  lambda_list <- emdbook::lseq(lambda_max,lambda_min,step)
  beta1.start <- rep(0,p1)
  beta2.start <- rep(0,p2)
  beta3.start <- rep(0,p3)
  beta.start <- c(beta1.start,beta2.start,beta3.start)
  best.lambda <- lambda_max
  beta.opt <- beta.start
  best.error <- Inf
  error_list <- matrix(0,step,4)
  error <- 0
  earlyStopping_high = 0
  
  matrix_beta <- matrix(0,step,p)
  
  ### Creating the K matrices we are going to use for cross validation
  output = cv_covariance_matrices_block_descent_general(K=K, mat=Z, y=y, p=p, p1=p1, p2=p2, p3=p3, mu=mu, 
                                                tau=tau, ratio_matrix = ratio_matrix, etol=etol, mode=mode)
  list_PSD_lasso_additive = output$list_PSD_lasso_additive
  list_PSD_error_additive = output$list_PSD_error_additive
  list_PSD_lasso_missing = output$list_PSD_lasso_missing
  list_PSD_error_missing = output$list_PSD_error_missing
  if (p1!=0) {
    list_sigma_lasso = output$list_sigma_lasso
    list_sigma_error = output$list_sigma_error
    sigma1 = output$sigma_global_uncorrupted
  }
  if (p1==0) {
    list_sigma_lasso = NULL
    list_sigma_error = NULL
    sigma1 = NULL
  }
  sigma2 = output$sigma_global_corrupted_additive
  sigma3 = output$sigma_global_corrupted_missing
  folds = output$folds
  X1 = Z[,1:p1]
  Z2 = Z[,(p1+1):(p1+p2)]
  Z3 = Z[,(p1+p2+1):p]
  
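  ### Pathwise descent over the lambda grid (from lambda_max down to lambda_min),
  ### with warm starts and a K-fold cross-validation error computed at each step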
  for (i in 1:step){
    print(paste0("iteration: ",i))
    
    lambda_step <- lambda_list[i]
    error_old <- error
    
    out = sapply(1:K, function(k)cross_validation_function.block_descent_general(k,
                                                                         Z,
                                                                         y,
                                                                         n,
                                                                         n_one_fold,
                                                                         n_without_fold,
                                                                         p,
                                                                         p1,
                                                                         p2,
                                                                         p3,
                                                                         folds,
                                                                         lambda_step,
                                                                         list_PSD_lasso_additive,
                                                                         list_PSD_lasso_missing,
                                                                         list_sigma_lasso,
                                                                         list_PSD_error_additive,
                                                                         list_PSD_error_missing,
                                                                         list_sigma_error,
                                                                         ratio_matrix = ratio_matrix,
                                                                         beta1.start,
                                                                         beta2.start,
                                                                         beta3.start,
                                                                         penalty=penalty))
    error = mean(out)
    sd_low = stats::quantile(out, probs = c(0.1))
    sd_high = stats::quantile(out, probs = c(0.9))
    error_list[i,1] <- error
    error_list[i,2] <- sd_low
    error_list[i,3] <- sd_high
    error_list[i,4] <- stats::sd(out)
    out = lasso_covariance_block_general(n=n, p1=p1, p2=p2, p3=p3, X1=X1, Z2=Z2, Z3=Z3, y=y, sigma1=sigma1, sigma2=sigma2, sigma3=sigma3, lambda=lambda_step, 
                                 ratio_matrix = ratio_matrix, beta1.start = beta1.start, beta2.start = beta2.start, beta3.start = beta3.start,
                                 penalty=penalty)
    if (p1 != 0) {
      beta1 <- out$coefficients.beta1
    }
    beta2 <- out$coefficients.beta2
    beta3 <- out$coefficients.beta3
    if (p1 != 0) {
      beta <- c(beta1,beta2,beta3)
    }
    if (p1 == 0) {
      beta <- c(beta2,beta3)
    }
    
    if (p1 != 0) {
      beta1.start <- beta1
    }
    if (p1 == 0) {
      beta1.start <- rep(0,p1)
    }
    beta2.start <- beta2
    beta3.start <- beta3
    matrix_beta[i,] <- beta
    
    ### Checking for optimal parameters
    if (error <= best.error){
      best.error <- error
      best.lambda <- lambda_step
      beta.opt <- beta
    }
    
    ## Early stopping
    if (abs(error - error_old) < optTol){
      print("Early Stopping because of convergence of the error")
      earlyStopping = i
      break
    }
    
    # Avoiding too small lambda values, which yield a higher error than lambda_max and lead to too time-consuming steps
    if (i>= step/2 && error >= error_list[1,1]){
      print("Value of lambda yielding too high error: we exit the loop")
      earlyStopping = i
      break
    }
    if (error > best.error){
      earlyStopping_high = earlyStopping_high +1
      if (earlyStopping_high >= earlyStopping_max){
        print("Early stopping because of error getting too high")
        earlyStopping = i
        break
      }
    }
  }
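  ### Selecting lambda.opt (minimum cross-validation error) and lambda.sd (the smallest lambda
  ### above lambda.opt whose error exceeds the minimum by more than one standard deviation)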
  df <- data.frame(lambda=lambda_list[1:earlyStopping], error=error_list[1:earlyStopping,1],error.inf=error_list[1:earlyStopping,2],error.sup=error_list[1:earlyStopping,3],error.sd=error_list[1:earlyStopping,4])
  step.min <- which.min(df[,"error"])
  sd.best <- df[step.min,"error.sd"]
  index.sd <- which(df[,"error"] > best.error + sd.best & df[,"lambda"] > df[step.min,"lambda"])
  step.sd <- if (length(index.sd) == 0) step.min else max(index.sd)
  lambda.sd <- df[step.sd,"lambda"]
  beta.sd <- matrix_beta[step.sd,]
  
  
  data_intermediate <- data.frame(matrix_beta[1:earlyStopping,])
  names(data_intermediate) <- sapply(1:p, function(i)paste0("beta",i))
  
  data_beta <- data.frame(lambda = lambda_list[1:earlyStopping])
  
  data_beta <- cbind(data_beta,data_intermediate)
  
  fit = list(
    lambda.opt = best.lambda,
    lambda.sd = lambda.sd,
    beta.opt = beta.opt,
    beta.sd = beta.sd,
    data_error = df,
    data_beta = data_beta,
    earlyStopping = earlyStopping,
    vnames = vnames,
    mean.Z = mean.Z,
    sd.Z = sd.Z,
    mean.y = mean.y,
    sd.y = sd.y
  )
  return(fit)
}