R/AFSSEN.R


#' AFSSEN
#'
#' Identifies significant predictors and produces smooth estimates of their coefficient functions in a function-on-scalar linear model
#' with sub-Gaussian errors and high-dimensional predictors.
#'
#' @param Y matrix. \code{N} by \code{m} matrix of pointwise observations.
#' @param X matrix. \code{N} by \code{I} design matrix. Its columns
#' must be standardized.
#' @param T_domain vector. Time domain for the evaluation of \code{Y} and the generation of the kernel.
#' Default is \code{T_domain = seq(0, 1, length = 50)}.
#' @param type_kernel string. Three choices are implemented: \code{gaussian},
#' \code{exponential} and \code{sobolev}.
#' Default is \code{"exponential"}.
#' @param param_kernel scalar. Value of the characteristic smoothing parameter of the kernel:
#' the \eqn{\sigma} parameter of the Gaussian and exponential kernels, and the
#' \eqn{\sigma} parameter of the Sobolev kernel.
#' Default is \code{8}.
#' @param thres scalar. Stopping criterion: threshold on the beta increment,
#' \deqn{
#' \|\beta^{(T)} - \beta^{(T-1)}\|_{H} < thres.
#' }
#' Default is \code{0.02}.
#' @param number_non_zeros scalar. Stopping criterion (kill switch): maximum number of nonzero predictors.
#' Default is \code{20}.
#' @param ratio_lambda_H scalar. Ratio \eqn{\lambda_{Hmin}/\lambda_{Hmax}} defining the smallest value of the \eqn{\lambda_H} grid.
#' Default is \code{0.01}.
#' @param number_lambda_H scalar. Number of log-equally-spaced values of \eqn{\lambda_{H}} generated in \eqn{[\lambda_{Hmin},\lambda_{Hmax}]}.
#' Default is \code{100}.
#' @param num_lambda_H_NONad scalar. Number of \eqn{\lambda_H} values used in the non-adaptive step.
#' Default is \code{50}.
#' @param lambda_H vector. Optional user-supplied vector of \eqn{\lambda_H} values; if empty, the grid is generated automatically.
#' Default is \code{numeric()}.
#' @param lambda_K vector. Vector of \eqn{\lambda_{K}} values. No default; it must be supplied.
#'
#' @param early_CV binary. 0 or 1: whether to apply the \code{early_CV_thres} stopping criterion.
#' Default is \code{0}.
#' @param early_CV_thres scalar. Stopping criterion: breaking point in the CV plot,
#' \deqn{
#' |CV(h-1,k)-CV(h,k)| / |CV(h-1,k)| < early_CV_thres.
#' }
#' Default is \code{0.001}.
#' @param max_ite_nadp scalar. Stopping criterion: maximum number of iterations of the coordinate-descent algorithm in the non-adaptive step.
#' Default is \code{10}.
#' @param max_ite_adp scalar. Stopping criterion: maximum number of iterations of the coordinate-descent algorithm in the adaptive step.
#' Default is \code{30}.
#' @param max_ite_final scalar. Stopping criterion: maximum number of iterations of the coordinate-descent algorithm in the final step.
#' Default is \code{50}.
#' @param target_inc binary. Stopping criterion, 0 or 1: if \code{1}, stop when the target function increases.
#' Default is \code{1}.
#' @param proportion_training_set scalar. Value in (0,1): the proportion of
#' observations used as training set for the cross-validation in the non-adaptive step.
#' Default is \code{0.75}.
#' @param verbose logical. If \code{TRUE}, progress messages are printed. Default is \code{FALSE}.
#' @param fold_ad scalar. Number of folds for the cross-validation in the adaptive step, used to select the optimal \eqn{\lambda_{H}} and \eqn{\lambda_{K}} before the final coefficient estimation.
#' Default is \code{10}.
#'
#' @return list containing:
#'  \itemize{
#'  \item \code{beta : } matrix. Final estimate of the coefficients.
#'  \item \code{beta_no_adaptive : } matrix. Estimate of the coefficients after the non-adaptive step.
#'  \item \code{predictors : } vector. Final set of significant predictors.
#'  \item \code{predictors_no_adaptive : } vector. Significant predictors after the non-adaptive step.
#'  \item \code{lambda_H_opt : } scalar. Optimal \eqn{\lambda_{H}}.
#'  \item \code{lambda_K_opt : } scalar. Optimal \eqn{\lambda_{K}}.
#'  }
#'
#' @export
#'
#' @import Rcpp MASS caret nloptr kernlab
#'
#' @examples
#' \dontrun{
#' # assuming `simulation` provides the design matrix X and the observations Y;
#' # lambda_K has no default and must be supplied
#' data(simulation)
#' data(SobolevKernel)
#' time <- proc.time()
#' AFSSEN_estimation <- AFSSEN(X = X, Y = Y, lambda_K = c(1, 0.1, 0.01))
#' duration <- proc.time() - time
#' duration
#' names(AFSSEN_estimation)
#' }
#'

AFSSEN <- function(X,Y, T_domain = seq(0, 1, length = 50),
                     # X   (a N*I numerical matrix)    N = #observations     I = #all_predictors
                     # Y = Y_full = XB + E   (a N*m matrix of pointwise evaluations)   m = #time_points
                     type_kernel="exponential",param_kernel=8,
                     thres=0.02, number_non_zeros=20,
                     ratio_lambda_H=0.01, number_lambda_H=100, num_lambda_H_NONad=50,
                     lambda_H=numeric(), lambda_K,
                     early_CV=0, early_CV_thres=0.001, max_ite_nadp=10,
                     max_ite_adp=30, max_ite_final=50, target_inc=1,
                     proportion_training_set=0.75, verbose=FALSE, fold_ad=10)

{

    # sourceCpp('src/FLAME_functions_cpp.cpp')
    # source("R/covMaterniso.R")
    # source("R/generation_kernel.R")
    # source("R/sobolev_kernel_generation.R")
    # source("R/norm_matrix_H.R")
    # source("R/projection_basis.R")
    # source("R/Matern_kernel_generation.R")


############################################################################
#
# Part 0: Defining the function to numerically estimate the H-norm of beta
#
############################################################################

    estimation_norm_R <- function(lambda_H, lambda_K, omega, B, tau)
    {

        ## @A: the H-norm x = \norm{\beta}_H solved for in AFSSEN satisfies
        ## \sum_{j=1}^{\infty} \frac{\tau_j^2 B_j^2}{((\tau_j + \lambda_K) x + \lambda_H \omega \tau_j)^2} = 1,
        ## where B_j is the j-th basis coefficient (see numerat/denom below).


        ## @A: nloptr is an R package for nonlinear optimization.

        ## ?@A: why do you optimize |1-1/sum| instead of |1-sum|?

        optimization_function <- function(x, lambda_H, lambda_K, omega, B, tau)
        {
            numerat <- (B^2) * (tau^2)
            denom <- ((tau+lambda_K)*x + lambda_H*omega*tau)^2
       #    tot <- 1 - 1 / (sum(numerat/denom))
            tot <- 1 - sum(numerat/denom)
            return(abs(tot))
        }
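
        ## Note (added for clarity): |1 - sum| and the commented |1 - 1/sum| vanish
        ## at exactly the same point, sum(numerat/denom) == 1, so both objectives
        ## share the same root; they differ only in how the error scales away from
        ## the root, which can affect the optimizer's path.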

        opts= list("algorithm"= "NLOPT_LN_COBYLA", "xtol_rel"=1.0e-16)


        ## ?@A: why does this line end with a semicolon (;)?
        ott_model <- nloptr(0, optimization_function, opts=opts, lb=0,
                            lambda_H=lambda_H,lambda_K=lambda_K, omega=omega, B=B, tau=tau);

        return(ott_model$solution)

    }
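
    ## A minimal sketch (added; hypothetical values, for intuition only): with a
    ## single eigen-direction the condition solved by estimation_norm_R is linear
    ## in x and has a closed form that can be used as a sanity check:
    ##   tau * |B| = (tau + lambda_K) * x + lambda_H * omega * tau
    ##   =>  x = tau * (|B| - lambda_H * omega) / (tau + lambda_K)
    ## e.g. tau = 1, B = 2, omega = 1, lambda_H = 0.5, lambda_K = 0.1 gives
    ## x = 1.5 / 1.1 = 1.3636..., which estimation_norm_R(0.5, 0.1, 1, 2, 1)
    ## should reproduce.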


    Proc_Time <- function()
    {
        r=as.numeric(proc.time()[3])
        return(r)
    }

    function_computation <- estimation_norm_R
    function_time <- Proc_Time



############################################################################
#
# Part 1: kernel generation
#
############################################################################

    #m <- 50 # total number of points
    #T_domain <- seq(0, 1, length = m) # time points, length = m
    M_integ <- length(T_domain)/diff(range(T_domain))


    M <- 50

    kernel_here <- generation_kernel(type = type_kernel,parameter = param_kernel,
                                     domain = T_domain, thres = 0.99, return.derivatives = TRUE)

    eigenval <- kernel_here$eigenval
    eigenvect <- kernel_here$eigenvect
    derivatives <- kernel_here$derivatives

    Y <- projection_basis(Y, eigenvect, M_integ)
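
    # Note (added): after this projection Y holds the coefficients of the
    # observations in the kernel eigenbasis; observations are stored
    # column-wise (hence the Y[, left_out] indexing used below).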


############################################################################
#
# Part 2: RUN non-adaptive version to estimate weights for the adaptive step
#
############################################################################

#######################################################
#
# Part 2.1: estimation_first
#
#######################################################

    weights = rep(1, dim(X)[2])

    # First step: all the weights are set to 1.
    # First run, to identify the possible values of lambda
    # (from lambda_max, which sets all the predictors to 0, down to the lambda value at which
    # number_non_zeros predictors are different from 0)

    # num_lambda_H_NONad = number_lambda_H  # NONad= Non adaptive

    #num_lambda_NONad <- round(number_lambda/5) # for this first step (the most
    # computationally expensive) we can reduce the number of lambda examined.
    # Then (in the Second: Adaptive Step)
    # we will consider the whole set of lambdas to define the final estimation






#######################
#######################
#######################     Non-adaptive
#######################
#######################



        if (verbose) print('Non-adaptive step')

        subset <- c(rep(2, ceiling(dim(X)[1]*proportion_training_set)),
                    rep(1, ceiling(dim(X)[1]*(1-proportion_training_set))))

        # proportion_training_set is the proportion of the data used to
        # fit the model (label 2); the remaining part (label 1, left_out)
        # is used to compute the prediction error

        random_groups <- subset[sample(1:dim(X)[1])]
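
        # Note (added): subset may be one element longer than N because of the
        # two ceiling() calls; indexing it by a permutation of 1:N keeps exactly
        # N labels, so each observation gets a random training/test assignment.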


        # training and test set
        i <- 1

        left_out <- which(random_groups==i)

        # fitting of the model with the proportion_training_set% of the data and
        # computation of the CV error

        #sourceCpp("FLAME_functions_cpp.cpp")

        eigenvect_for_der=eigenvect[-dim(eigenvect)[1],]

        estimation_first_CV <- definition_beta_CV(X[-left_out,], Y[, -left_out],
                                 X[left_out,], Y[, left_out],
                                 eigenval, weights,
                                 function_computation, function_time,
                                 thres, number_non_zeros,
                                 lambda_H,lambda_K,
                                 ratio_lambda_H, num_lambda_H_NONad,
                                 early_CV, early_CV_thres, max_ite_nadp, target_inc,
                                 verbose, base::sum)
                                 #projection_basis, M_integ-1,
                                 #derivatives, eigenvect_for_der, XBd[,left_out])




        ## @A: estimation_first_CV$error is \norm{Y_pred - Y_test}_H for each lambda pair.
        ## @A: the values selected below are indices into the lambda grids, not the optimal lambdas themselves.

        ## @A: estimation_first_CV$error is a (#lambda_H) x (#lambda_K) matrix
        ind_min_first <- which(estimation_first_CV$error == min(estimation_first_CV$error[estimation_first_CV$error > 0]), arr.ind = TRUE)[1, ]
        lambda_H_first_selected <- ind_min_first[1] # index of the optimal lambda_H
        lambda_K_first_selected <- ind_min_first[2] # index of the optimal lambda_K


#######################################################
#
# Part 2.3: estimation_first_definite, weights_new and
#
#######################################################


        if (length(estimation_first_CV$Pred_opt)==0)
        {
            print('No significant predictors identified.')
            result <- list(beta=NULL,
                           beta_no_adaptive=NULL,
                           predictors=NULL,
                           predictors_no_adaptive=NULL)
            return(result)
        }else
            {
                if(length(estimation_first_CV$Pred_opt)==1)
                {

                    ## @A: the following matrices have a single column because length(estimation_first_CV$Pred_opt) == 1
                    beta_selected <- matrix(estimation_first_CV$Beta_opt[,estimation_first_CV$Pred_opt],
                                            length(estimation_first_CV$Beta_opt[,estimation_first_CV$Pred_opt]), 1)
                    X2_data <- matrix(X[,estimation_first_CV$Pred_opt], length(X[,estimation_first_CV$Pred_opt]),1)

                }

            else{
                beta_selected <- estimation_first_CV$Beta_opt[,estimation_first_CV$Pred_opt]
                X2_data <- X[,estimation_first_CV$Pred_opt]
            }
            }



            # definition of the weights
            weights_new <- 1/norm_matrix_H(beta_selected)

            ## @A: In the adaptive step we only consider the nonzero predictors and drop the zero ones,
            ## so the length of weights_new can be less than that of the initial weights.

############################################################################
#
# Part 3: RUN adaptive step
#
############################################################################


#######################################################
#
# Part 3.1: estimation_second
#
#######################################################


            if (verbose) print('Adaptive step')


            flds <- createFolds(1:dim(X)[1], k = fold_ad, list = TRUE, returnTrain = FALSE)
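
            # Note (added): caret::createFolds with returnTrain = FALSE returns,
            # for each of the fold_ad folds, the indices of the held-out observations.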

            SUM_MAT_CV = matrix(0,number_lambda_H,length(lambda_K))
            SUM_MAT_der = matrix(0,number_lambda_H,length(lambda_K))
            #error_der = rep(0,fold_ad)

      for( f in 1:fold_ad){


      left_out <- flds[[f]]



      estimation_second_CV <- definition_beta_CV(as.matrix(X2_data[-left_out,]), Y[, -left_out],
                                        as.matrix(X2_data[left_out,]), Y[, left_out],
                                        eigenval, weights_new,
                                        function_computation, function_time,
                                        thres, number_non_zeros,
                                        lambda_H, lambda_K,
                                        ratio_lambda_H, number_lambda_H,
                                        early_CV, early_CV_thres, max_ite_adp,target_inc,
                                        verbose, base::sum)
                                        #projection_basis, M_integ-1,
                                        #derivatives, eigenvect_for_der, XBd[,left_out])


      A <- estimation_second_CV$error

      A [A ==0] <- Inf

      SUM_MAT_CV = SUM_MAT_CV + A
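
      # Note (added): zero entries of the error matrix correspond to
      # (lambda_H, lambda_K) pairs that were never evaluated (early stopping);
      # setting them to Inf before accumulating keeps them from being selected
      # as the minimum mean CV error below.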

      # B <- estimation_second_CV$error_der
      #
      # B [B ==0] <- Inf
      #
      # SUM_MAT_der = SUM_MAT_der + B


      # # # #derivatives
      # B_pred_der_time <- t(derivatives %*%estimation_second_CV$Beta_opt[,estimation_second_CV$Pred_opt])
      # Y_true_der_time <- X[,estimation_second_CV$Pred_opt]%*%B_pred_der_time
      # Y_pred_der <- projection_basis(Y_true_der_time, eigenvect[-dim(eigenvect)[1],], M_integ)
      #
      # diff_der= Y_true_der - Y_pred_der
      #
      # error_der[f]=sqrt(sum(apply(diff_der,2,function(x) sum(x^2))))

      print(paste("fold",f,"   ,"))
            }


            Mean_Error_CV=as.matrix(SUM_MAT_CV/fold_ad)
            # Mean_Error_der=as.matrix(SUM_MAT_der/fold_ad)

            ind_min_cv = as.vector(which(Mean_Error_CV==min(Mean_Error_CV[which(Mean_Error_CV>0)]) , arr.ind = TRUE)[1,])


            min_pred_CV=min(Mean_Error_CV)

            # if(length(lambda_K)==1)
            # corres_pred_der=Mean_Error_der[ind_min_cv[1]]
            # else
            # corres_pred_der=Mean_Error_der[ind_min_cv]
            #
            # min_pred_der=min(Mean_Error_der)

            lambda_K_opt=lambda_K[ind_min_cv[2]]
            lambda_H_opt=estimation_second_CV$lambda_H[ind_min_cv[1]]
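
            # Note (added): the lambda_H grid here is read off the last fold's fit;
            # this implicitly assumes definition_beta_CV generates the same lambda_H
            # sequence in every fold.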



###################################
#
# Run on the whole data with the best lambda_H and lambda_K
#
###################################


            if (verbose) print('Adaptive step for optimum parameters')

            estimation_second <- definition_beta(as.matrix(X2_data), Y,
                                        eigenval, weights_new,
                                        function_computation, function_time,
                                        thres, number_non_zeros,
                                        lambda_H_opt, lambda_K_opt,
                                        ratio_lambda_H, number_lambda_H,
                                        verbose,base::sum)





#######################################################
#
# Part 3.3:  The output values
#
#######################################################

            predictors_2=estimation_second$Pred # final set of
            # predictors different from 0 (among the ones isolated in the
            # First: Non-Adaptive step)
            beta_2=estimation_second$Beta[,predictors_2] # estimated betas,
            # it is the matrix of the coefficients of the basis expansion
            # of the betas (with respect to the basis defined by the eigenvectors
            # of the kernel)

            ## @A: Pay attention: predictors_2 are the labels of the nonzero predictors among the
            ## ones isolated in the first (non-adaptive) step. To recover the actual labels among the
            ## whole set of predictors we define "predictor_def" as follows.

            predictor_def <- estimation_first_CV$Pred_opt[predictors_2] # index of
            # the non-zero predictors computed in the estimation

            beta_def <- matrix(0, dim(estimation_first_CV$Beta_opt)[1],
                               dim(estimation_first_CV$Beta_opt)[2])

            ## @A: "beta_def" is a J*I matrix which the nonzero ones estimated by estimation_second and
            ## rest of the columns are zero.

            lambda_H=estimation_second_CV$lambda_H
            lambda_K=estimation_second_CV$lambda_K






            beta_def[, predictor_def] <- beta_2 # final matrix of the estimated betas



            #XBhat=(beta_2)%*%t(X[,predictor_def])


            # # #derivatives
            #Bhat_true_der_time <- t(kernel_here$derivatives %*% beta_2)
            #Yhat_true_der_time <- X[,predictor_def]%*%Bhat_true_der_time
            #XBhatd <- projection_basis(Yhat_true_der_time, eigenvect[-dim(eigenvect)[1],], M_integ-1)


            #pred_error=sum(norm_matrix_H(XB-XBhat))
            #pred_error_der=sum(norm_matrix_H(XBd-XBhatd))

            # (still as coefficients of the kernel basis)
            result <- list(beta=beta_def,
                           beta_no_adaptive=estimation_first_CV$Beta_opt,
                           predictors=predictor_def,
                           predictors_no_adaptive=estimation_first_CV$Pred_opt,
                           Mean_Error_CV=Mean_Error_CV,
                           min_pred_CV=min_pred_CV,
                           index_lambdas=ind_min_cv,
                           lambda_H=lambda_H,
                           lambda_K=lambda_K,
                           lambda_K_opt=lambda_K_opt,
                           lambda_H_opt=lambda_H_opt,
                           L_matrix_nonad=estimation_first_CV$L_matrix,
                           L_matrix_ad=estimation_second_CV$L_matrix,
                           Mat_target_inc=estimation_second_CV$Mat_target_inc,
                           vec_number_lambda_H_computed_adp=estimation_second_CV$vec_number_lambda_H_computed,
                           vec_break_in_number_pred_adp=estimation_second_CV$vec_break_in_number_pred,
                           vec_break_in_early_cv_adp=estimation_second_CV$vec_break_in_early_cv,
                           vec_norm_H_diff_beta_inc_nadp=estimation_first_CV$vec_norm_H_diff_beta_inc,
                           vec_norm_H_diff_beta_inc_adp=estimation_second_CV$vec_norm_H_diff_beta_inc,
                           X=X,
                           Y=Y,
                           #beta_time=beta,
                           #XB=XB,
                           #XBd=XBd,
                           #XBhat=XBhat,
                           #XBhatd=XBhatd,
                           #pred_error=pred_error,
                           #pred_error_der=pred_error_der,
                           kernel_vec_der=derivatives,
                           kernel_vec=eigenvect,
                           kernel_val=eigenval,
                           T_domain=T_domain)

    return(result)

      }












###############
###############
############### Test
###############
###############


#
# N=200
# I=200
# I0=10
# nu_beta=2.5
# range=1/4
# variance=1 # for the rough beta in Matern
# nu_eps = 1.5
# range_eps=1/4 # for epsilon in Matern
#
#
# m <- 110 # total number of points
# T_domain <- seq(0, 1, length = m) # time points, length = m
# M_integ <- length(T_domain)/diff(range(T_domain))
#
# # # # # # #######
# # # #
# # # definition of the design matrix X, in this specific case the
# # # covariance matrix C is the identity matrix
# mu_x <- rep(0, I)
# C <- diag(I)
# X <- mvrnorm(n=N, mu=mu_x, Sigma=C) # X is a N*I matrix
# X <- scale(X) # normalization
#
# # # # # # #######
# # # # #
# # # definition of the coefficients
#
# hyp <- c(log(range), log(variance)/2) # set of parameters for the # Matern Covariance operator of beta
# mu_beta <- rep(0,m) # mean of the beta
# Sig_beta <- covMaterniso(nu_beta, rho = range, sigma = sqrt(variance) , T_domain)
# beta <- mvrnorm(mu=mu_beta, Sigma=Sig_beta, n=I0) # beta is a I0*m matrix. Each row is made by a matern process.
#
# # # # # # # I0 significant coefficients
# # # # # # # definition of the random errors
#
# mu_eps <- rep(0, m)
# Sig_eps <- covMaterniso(nu_eps, rho = range_eps, sigma = sqrt(variance), T_domain)
# eps <- mvrnorm(mu=mu_eps, Sigma=Sig_eps, n=N) # eps is a N*m matrix
#
# # # # # # random errors
# # # #
# I_X <- sort(sample(1:I, I0)) # index of the I0 significant predictors
#
# Y_true <- X[,I_X]%*%beta
# Y_full <- X[,I_X]%*%beta + eps # Y_n observations (pointwise evaluation)
#
# M <- 50
#
# kernel_here <- generation_kernel(type = "exponential",parameter = 8,
#                                  domain = T_domain, thres = 0.99, return.derivatives = TRUE)
#
# eigenval <- kernel_here$eigenval
# eigenvect <- kernel_here$eigenvect
# derivatives <- kernel_here$derivatives
#
#
# # # # # projection on the kernel basis of y and beta
# Y_matrix <- projection_basis(Y_full, eigenvect, M_integ)
# XB <- projection_basis(Y_true, eigenvect, M_integ)
#
#
# B_true <- projection_basis(beta, eigenvect, M_integ)
# matrix_beta_true_full <- matrix(0, dim(B_true)[1], I)
# matrix_beta_true_full[,I_X] <- B_true
#
# # # #derivatives
# B_true_der_time <- t(kernel_here$derivatives %*% B_true)
# Y_true_der_time <- X[,I_X]%*%B_true_der_time
# XBd <- projection_basis(Y_true_der_time, eigenvect[-dim(eigenvect)[1],], M_integ-1)
#
# Y <- Y_matrix
#
# ##############
#
# Y_total <- Y_full
# X_total <- X
#
# ##############
# # # #
# # #
# # #
# # #
# # #
# Output <- AFSSEN(X=X_total,Y=Y_total,T_domain = seq(0, 1, length = 50),
#                      # X   (a N*I numerical matrix)    N = #observations     I = #all_predictors
#                      # Y = Y_ful=XB+E   (a N*m matrix of pointwise evaluation!)   m = #time_points
#                      type_kernel="exponential",param_kernel=8,
#                      thres=0.02, number_non_zeros=20,
#                      ratio_lambda_H=0.001, number_lambda_H=100, num_lambda_H_NONad=50,
#                      lambda_H=numeric(), lambda_K=c(1,0.1,0.01),
#                      early_CV=0, early_CV_thres=0.01, max_ite_nadp=5, max_ite_adp=10, max_ite_final=10, target_inc=1,
#                      proportion_training_set=0.75, verbose=FALSE, fold_ad=2)