# R/estimation_beta.R In ardeeshany/FLAME: FLAME for high-dimensional Function-on-Scalar regression problems

#### Documented in estimation_beta

#' FLAME for the Function-on-Scalar regression coefficients.
#'
#' This function computes  FLAME for the Function-on-Scalar
#' regression problem,
#' returning the significant parameters and
#' the estimated coefficients in the kernel basis.
#'
#' @param X matrix. \code{N} \eqn{\times} \code{I} design matrix. It has
#' standardized columns.
#' @param Y matrix. \code{J} \eqn{\times} \code{N} matrix containing the
#' coefficients of the projection of the \code{N} response functions on the kernel
#' basis. In particular column \eqn{n} contains the basis projection of \eqn{y_n}.
#' @param eigenval vector. \code{J}-length vector containing the eigenvalues
#' of the kernel.
#' @param NoI scalar. integer, maximum number of iterations in the
#' Coordinate-Descent loop.
#' @param thres scalar. tolerance on the K-norm of the increment of the estimation
#' to stop the Coordinate-Descent loop
#' @param number_non_zeros scalar. integer,
#' threshold on the number of non zeros parameters to be detected. It is the
#' kill switch parameter presented in the Vignette.
#' @param ratio_lambda scalar. ratio to compute the minimum value of lambda. The
#' maximum \eqn{\lambda_{\max}} is computed as the minimum value which makes all the coefficients
#' equal to zero. The minimum is the product \code{ratio_lambda}\eqn{\times \lambda_{\max}}.
#' @param number_lambda scalar. integer, length of the grid for the
#' \eqn{\lambda} parameter.
#' @param proportion_training_set scalar. value in (0,1), the
#' proportion for the training set for the Cross Validation.
#' @param verbose bool. If \code{TRUE} the progression of the algorithm in the
#' adaptive and non adaptive step is shown. If \code{FALSE} no output is shown.
#' Default is \code{FALSE}.
#'
#' @return list containing:
#'  \itemize{
#'  \item \code{beta} matrix. \code{J} \eqn{\times} \code{I} matrix of the final estimated
#'  coefficients in the kernel basis.
#'  \item \code{beta_no_adaptive}  matrix \code{J} \eqn{\times} \code{I} matrix of
#'  the coefficients estimated at the end of the non-adaptive step.
#'  \item \code{predictors} vector of the indices of the non-zero estimated predictors.
#'  \item \code{predictors_no_adaptive}  vector of the indices of the non-zeros
#'  predictor estimated after the non-adaptive step.}
#'
#' @export
#'
#' @examples
#' \dontrun{
#' data(simulation)
#' data(SobolevKernel)
#' I0 <- dim(B_true)[2]
#' time <- proc.time()
#' FLAME_estimation <- estimation_beta(X = X,
#'                                  Y = Y_matrix,
#'                                  eigenval = eigenval,
#'                                  NoI = 10,
#'                                  thres = 0.1,
#'                                  number_non_zeros = I0*2,
#'                                  ratio_lambda = 0.01,
#'                                  number_lambda = 100,
#'                                  proportion_training_set = 0.75,
#'                                  verbose = FALSE)
#' duration <- proc.time()-time
#' duration
#' names(FLAME_estimation)
#' }
#'
#' @import Rcpp
#' @import nloptr
#' @useDynLib flm, .registration = TRUE
#'

estimation_beta <- function(X, Y, eigenval,
NoI, thres, number_non_zeros, ## @A: These three variables are stopping criteria
ratio_lambda, # lambda,
number_lambda, proportion_training_set,
verbose = FALSE)
{
#
# @param BIC_adaptive_step bool. if \code{TRUE} the \eqn{\lambda} paramter
# of the adaptive step is choosen with the Bayesian Index Criteria (BIC); othewise
# Cross-Validation is used.
# @param lambda scalar. value of \eqn{\lambda}.
# set this paramter to \code{numeric()} to ignore it and run the
# algorithm to identify the proper value of \eqn{\lambda}. The vector
# is defined with \code{ratio_lambda} and \code{number_lambda} parameters.
# The best value of \eqn{\lambda} is detected with Cross-Validation.
# if \code{lambda} is different from \code{numeric()}
# further parameters are ignored

# estimation_norm_R is the function used to compute the norm of
# beta. It is numerically solved with the NLOPTR optimization tool.

############################################################################
#
# Part 1: Defining the function for numerically estimate norm beta in K
#
############################################################################
### %% Need to modify for AFSSEN

estimation_norm_R <- function(lambda, omega, B, tau)
{

## @A: the function we want to optimize in FLAME is:
## 1=\sum_{j=1}^{\infty} \dfrac{tau_j < \beta_j , v_j >^2 }{ tau_j \norm{\beta_j}_K + \lambda omega_j}^2

## @A: nloptr is an R package for nonlinear optimization.

## [email protected]: why do you optimiza |1-1/sum| instead of |1-sum|?

optimization_function <- function(x, lambda, omega, B, tau)
{
numerat <- B^2 * tau
denom <- (tau * x + lambda * omega)^2
tot <- 1 - 1 / (sum(numerat/denom))
return(abs(tot))
}

## [email protected]: Are these initial values are fixed or we need to change them in different situation?
opts= list("algorithm"= "NLOPT_LN_COBYLA", "xtol_rel"=1.0e-16)

## [email protected]: why it ended up with semicolon (;)?
ott_model <- nloptr(0, optimization_function, opts=opts, lb=0, omega=omega,lambda=lambda, B=B, tau=tau);

return(ott_model$solution) } ############################################################################ # # Part 2: RUN non-adaptive version to estimte weights for adaptive step # ############################################################################ ####################################################### # # Part 2.1: estimation_first # ####################################################### function_computation <- estimation_norm_R weights = rep(1, dim(X)[2]) # First step: all the weights are set to 1 # first run to identify the possible values of lambda # (from lambda_max -all the predictors set to 0- to the the value for which # number_non_zeros predicotors are different from 0) num_lambda_NONad <- number_lambda #num_lambda_NONad <- round(number_lambda/5) # for this first step (the most # computationally expensive) we can reduce the number of lambda examined. # Then (in the Second: Adaptive Step) # we will consider the whole set of lambdas to define the final # estimation # num_lambda_NONad <- number_lambda # no reduction lambda <- numeric() if (verbose) print('Non-adaptive step') ## [email protected]: what is the matrix(, 1, 0) for Beta here? ## @A: we can import the vector of lambda in "lambda" or same as here put it numeric() (means nothing) ## and ask the definition_beta function generate the log equispaced lambda function with initializing ratio_lambda and number_lambda. estimation_first <- definition_beta(X, Y, matrix(, 1, 0), eigenval, weights, NoI, function_computation, thres, number_non_zeros, lambda, ratio_lambda, num_lambda_NONad, BIC = 0, verbose = FALSE) ## @A: estimation_first$Lambda_vect is a vector {lambda_max,....,lambda*} the lambdas were run in definition_beta.
## lambda* can be lambda_min or some lambda larger than it.

lambda_first <- estimation_first$Lambda_vect ####################################################### # # Part 2.2: estimation_first_CV and finding lamda which implies minimum CV # ####################################################### # Corss Valiation to identify the optimum value of lambda among # the set of the possible values previously # identified (the optimum lambda will be # the ones which minimizes the cross validation error) if (verbose) print('Validation on the test set: identification of lambda') subset <- c(rep(2, ceiling(dim(X)[1]*proportion_training_set)), rep(1, ceiling(dim(X)[1]*(1-proportion_training_set)))) # proportion_training_set is the percentage of the data to use to # fit the model (2), the remaining part to compute the prediction # error:left_out (1) ## [email protected]: there exists a set.seed for lable of training set! set.seed(16589) random_groups <- subset[sample(1:dim(X)[1])] # definition of the # training and test set i <- 1 left_out <- which(random_groups==i) # fitting of the model with the proportion_training_set% of the data and # computation of the CV error estimation_first_CV <- definition_beta_CV(X[-left_out,], Y[, -left_out], X[left_out,], Y[, left_out], eigenval, weights, NoI, function_computation, thres, number_non_zeros, lambda_first, verbose = FALSE) # print(estimation_first_CV$error)

## @A: estimation_first_CV$error is the \norm{Y_pred - Y_test}_H for different lambda. ## @A: lambda_first_selected is not the optimum lambda. It is the index of the lambda. lambda_first_selected <- which.min(estimation_first_CV$error) # optimum
# lambda. It minimizes the CV error

#######################################################
#
# Part 2.3: estimation_first_definite, weights_new and
#
#######################################################

if (verbose) { print(paste("Non adaptive step: final estimation with the optimum lambda")) }

# if (estimation_first_CV$reached_last==1) #{ # if the CV error identifies the last value of lambda, # we take as estimation the result of the previous run # estimation_first_definite <- estimation_first # } # if (estimation_first_CV$reached_last==0){
# otherwise we compute again the estimation with
# to the optimum value of lambda

## @A: Since lambda_first=estimation_first$Lambda_vect, is a vector {lambda_max,....,lambda*} the lambdas were run in definition beta. ## So we import the lambda_start, we do not need to initialize the num_lambda and ratio_lambda, so we put them 0. It can be anything else. ## @A: here the weights are still 1. ## @A: we reduced the range of lambda from [lambda_max, lambda_min_by_CV] ## [email protected]: In part 2.1 we calculated definition_beta for lambda={\lambda_max,....,\lambda_min}. In ## part 2.2 we find the lambda_first_selected which is the label of tha lambda gives the min C.V. ## But again here calculate definition_beta for lambda={\lambda_max , ... , \lambda_{lambda_first_selected}} which ## the result should be a subset of results of part 2.1. Don't you think that? It does not ## affect on final result but makes the code slower. estimation_first_definite <- definition_beta(X, Y, matrix(, 1, 0), #estimation_first_CV$Beta, #,
eigenval, weights, NoI,
function_computation, thres,
number_non_zeros,
lambda_first[1:lambda_first_selected],
0, 0, BIC = 0, verbose = FALSE)
#    }

# adaptive step to improve the estimation of the
# predicotors and to make a further selection.

if(verbose) {print (paste("Adaptive Step: update of the estimation of the",
length(estimation_first_definite$Pred), "non zeros predictors identified."))} # isolation of the significant predictors fitted by the Non-Adaptive step. ## @A: estimation_first_definite$Pred is just the label of nonzero predictors

if (length(estimation_first_definite$Pred)==0) { print('No significant predictors indentified.') result <- list(beta=NULL, beta_no_adaptive=NULL, predictors=NULL, predictors_no_adaptive=NULL) return(result) }else{ if(length(estimation_first_definite$Pred)==1)
{

## @A: In the following matrices, we have just one column because the length(estimation_first_definite$Pred)==1 beta_selected <- matrix(estimation_first_definite$Beta[,estimation_first_definite$Pred], length(estimation_first_definite$Beta[,estimation_first_definite$Pred]), 1) X2_data <- matrix(X[,estimation_first_definite$Pred], length(X[,estimation_first_definite$Pred]),1) }else { beta_selected <- estimation_first_definite$Beta[,estimation_first_definite$Pred] X2_data <- X[,estimation_first_definite$Pred]
}

# defintion of the weights
weights_new <- 1/norm_matrix_K(beta_selected, eigenval)

## @A: For adaptive step, we just consider the non zero predictors and delete the zero ones.
## you can see the length of weights_new can be less than the initial weights.

############################################################################
#
# Part 3: RUN adaptive step
#
############################################################################

#######################################################
#
# Part 3.1: estimation_second
#
#######################################################

# the optimum value of lambda can be identified both with Cross-Validation
# and BIC. (depending on the BIC_adaptive_step input parameter)

## @A: Here we changed X to X2_data. It's because we don't want to consider the zero predictors.

if (BIC_adaptive_step == TRUE) # BIC
{
estimation_second <- definition_beta(X2_data, Y, matrix(, 1, 0),
eigenval, weights_new, NoI,
function_computation, thres,
number_non_zeros, numeric(),
ratio_lambda, number_lambda, BIC = 1, verbose = FALSE)
lambda_second <- estimation_second$Lambda_vect } if (BIC_adaptive_step == FALSE) # Cross-Validation { estimation_second <- definition_beta(X2_data, Y, matrix(, 1, 0), eigenval, weights_new, NoI, function_computation, thres, number_non_zeros, numeric(), ratio_lambda, number_lambda, BIC = 0, verbose = FALSE) lambda_second <- estimation_second$Lambda_vect

#######################################################
#
# Part 3.2: estimation_second_CV and finding lamda which implies minimum CV
#
#######################################################

# Cross Validation procedure
if (verbose) print('Validation on the test set: identification of lambda')

subset <- c(rep(2, ceiling(dim(X2_data)[1]*proportion_training_set)), rep(1, ceiling(dim(X2_data)[1]*(1-proportion_training_set)))) # 75% test set (2), 25% training:left_out (1)
set.seed(16589)
random_groups <- subset[sample(1:dim(X2_data)[1])]

i <- 1 # training set is 1

left_out <- which(random_groups==i)
estimation_second_CV <- definition_beta_CV(X2_data[-left_out,], Y[, -left_out],
X2_data[left_out,], Y[, left_out],
eigenval, weights_new, NoI,
function_computation,
thres, number_non_zeros,
lambda_second, verbose = FALSE)

#print(estimation_second_CV$error) ## @A: lambda_second_selected is the index of the lambda which implies the min C.V. lambda_second_selected <- which.min(estimation_second_CV$error)

if (lambda_second_selected == lambda_second[length(lambda_second)])
{
warning('Last value of the gird of lambda selected by Cross Validation. Change the grid for lambda and run FLAME again!')
}

# print(lambda_second_selected)
# plot(lambda_first,estimation_first_CV$error, pch=19, main='First step: selection of the value of lambda') # points(lambda_first[lambda_first_selected], estimation_first_CV$error[lambda_first_selected], pch=19, col=2)

if(verbose)
{
print(paste("Non adaptive step: final estimation with the optimum lambda")) }

# if (estimation_second_CV$reached_last==1) # { # if the CV error identifies as optimum the last value of lambda, # we take as estimation the result of the previous run # estimation_second_definite <- estimation_second # } # if (estimation_second_CV$reached_last==0){
# otherwise we compute again the estimation with the
# optimum value of lambda

#######################################################
#
# Part 3.3:  estimation_second_definite and finding lamda which implies minimum CV
#
#######################################################

## [email protected]: Same as part 2.3 I have the problem here.
## In part 3.1 we calculated definition_beta for lambda={\lambda_max,....,\lambda_min}. In
## part 3.2 we find the lambda_second_selected  which is the label of tha lambda gives the min C.V.
## But again here we are gonna calculate the definition_beta for lambda={\lambda_max , ... , \lambda_{lambda_first_selected}} which
## the result should be a subset of results of part 3.1. Don't you think that? It does not affect on final result but does not change anything.

estimation_second_definite <- definition_beta(X2_data, Y,
matrix(, 1, 0), #estimation_second_CV$Beta, eigenval, weights_new, NoI, function_computation, thres, number_non_zeros, lambda_second[1:lambda_second_selected], 0, 0, BIC = 0, verbose = FALSE) # } estimation_second <- estimation_second_definite } ####################################################### # # Part 3.3: The output values # ####################################################### predictors_2=estimation_second$Pred # final set of
# predictors different from 0 (among the ones isolated in the
beta_2=estimation_second$Beta[,predictors_2] # estimated betas, # it is the matrix of the coefficients of the basis expansion # of the betas (with respect to the basis defined by the eigenvectors # of the kernel) ## @A: Pay attention: predictors_2 are the labels of nonzro predictors among the ## ones isolated in the First: Non-Adaptive step. So for finding the actual labels among the ## whole predictors we need to define "predictor_def" as follows predictor_def <- estimation_first_definite$Pred[predictors_2] # index of
# the non-zero predictors computed in the estimation

beta_def <- matrix(0, dim(estimation_first_definite$Beta)[1], dim(estimation_first_definite$Beta)[2])

## @A: "beta_def" is a J*I matrix which the nonzero ones estimated by estimation_second and
## rest of the columns are zero.

beta_def[, predictor_def] <- beta_2 # final matrix of the estimated betas
# (still as coefficients of the kernel basis)

if (verbose) {print(paste("Total number of non zeros predictor estimated is ", length(predictors_2)))}

result <- list(beta=beta_def,
beta_no_adaptive=estimation_first_definite$Beta, predictors=predictor_def, predictors_no_adaptive=estimation_first_definite$Pred)
return(result)
}

}

## End

ardeeshany/FLAME documentation built on Sept. 25, 2017, 9:55 a.m.