varselTBIC:

# #############################################################################
# ## Sequential & parallel robust forward greedy search REDDA 
# #############################################################################
# # In this implementation models are robustly estimated but the comparison is made
# # via BIC (aka non-robust information criterion) for the entire set of units
# # it does not work: see Section 5 of \cite{Greco2019} 
# 
# redda_varsel_gr_fwd <- function(Xtrain,
#                                 cltrain,
#                           emModels1 = c("E","V"),
#                           emModels2 = mclust.options("emModelNames"),
#                           alpha_Xtrain,
#                           forcetwo = TRUE,
#                           BIC.diff = 0,
#                           itermax = 100,
#                           nsamp= 100,
#                           parallel = FALSE,
#                           fit= TRUE,
#                           verbose = interactive())
# {
#   Xtrain <- data.matrix(Xtrain)
#   Ntrain <- nrow(Xtrain) # number of rows = number of observations
#   Ntrain_trim <- if(alpha_Xtrain!=0) {Ntrain - ceiling(Ntrain * alpha_Xtrain)} else {NULL}
#   d <- ncol(Xtrain) # number of columns = number of variables
#   if(is.null(colnames(Xtrain))){
#     colnames(Xtrain) <- paste("X", 1:d, sep = "")
#   }
#   G <- length(unique(cltrain))
# 
#   # Start "parallel backend" if needed
#   if(is.logical(parallel)) {
#     if (parallel) {
#       parallel <- clustvarsel::startParallel(parallel)
#       stopCluster <-
#         TRUE
#     } else {
#       parallel <- stopCluster <- FALSE
#     }
#   } else {
#     stopCluster <- if (inherits(parallel, "cluster"))
#       FALSE
#     else
#       TRUE
#     parallel <- clustvarsel::startParallel(parallel)
#   }
#   on.exit(if (parallel & stopCluster)
#     parallel::stopCluster(attr(parallel, "cluster")))
# 
#   # define operator to use depending on parallel being TRUE or FALSE
#   `%DO%` <- if(parallel) `%dopar%` else `%do%`
#   i <- NULL # dummy to trick R CMD check
# 
#   # First Step - selecting single variable
#   if(verbose) cat(paste("iter 1\n+ adding step\n"))
#   out <- foreach(i = 1:d) %DO% {
#     xBIC <- NULL
#     # Fit the single variable grouping models
#     try(xBIC <- redda_varsel(
#       X_p = NULL,
#       X_c = Xtrain[, i],
#       cltrain = cltrain,
#       alpha_Xtrain = alpha_Xtrain,
#       modelNames = emModels1,
#       nsamp = nsamp,
#       model = "GR"
#     )$Best,
#     silent = TRUE)
# 
#     # maxBIC is the maximum BIC over all grouping models fit
#     maxBIC <- if (is.null(xBIC$bic)) {
#       NA
#     } else {
#       BIC_computation(
#         X_p = NULL,
#         X_c = Xtrain[, i],
#         cltrain = cltrain,
#         fitted_model = xBIC,
#         model_type = "GR"
#       )
#     }
#     # Fit and get BIC for a single component no-group normal model
#     if (alpha_Xtrain != 0) {
#       best_subset <-
#         MASS::cov.mcd(x = Xtrain[, i], quantile.used = floor((1 - alpha_Xtrain) * Ntrain))$best
#       # I recover the best subset on which the mean and the variance are computed.
#       # Afterwards, I perform a robust single component no-grouping normal model using Mclust,
#       # this is done because cov.mcd (as well as covMcd in robustbase)
#       # use consistency correction that I do not want to include
# 
#       try(fit_no_grouping <-
#             mclust::Mclust(
#               Xtrain[best_subset, i],
#               G = 1,
#               modelNames = emModels1,
#               verbose = FALSE
#             ),
#           silent = TRUE)
#       oneBIC <- if (is.null(fit_no_grouping$bic)) {
#         NA
#       } else {
#         2*sum(
#           dnorm(
#             x = Xtrain[, i],
#             mean = fit_no_grouping$parameters$mean,
#             sd = sqrt(fit_no_grouping$parameters$variance$sigmasq),
#             log = TRUE
#           )
#         ) - 2*log(length(Xtrain[, i]))
#       }
#     } else {
#       try(oneBIC <- mclust::Mclust(
#         Xtrain[, i],
#         G = 1,
#         modelNames = emModels1,
#         verbose = FALSE
#       )$BIC[1],
#       silent = TRUE)
#     }
#     return(list(maxBIC, oneBIC, xBIC$modelName, xBIC$G))
#   }
#   maxBIC <- sapply(out, "[[", 1)
#   oneBIC <- sapply(out, "[[", 2)
#   # Difference between maximum BIC for grouping and BIC for no grouping
#   maxdiff <- maxBIC - oneBIC
#   # Find the single variable with the biggest difference between
#   # grouping and no grouping
#   m <- max(maxdiff[is.finite(maxdiff)])
#   arg <- which(maxdiff==m,arr.ind=TRUE)[1]
#   # This is our first selected variable/S is the matrix of currently selected
#   # grouping variables
#   S <- Xtrain[,arg,drop=FALSE]
#   # BICS is the BIC value for the grouping model with the variable(s) in S
#   BICS <- maxBIC[arg]
#   # NS is the matrix of currently not selected variables
#   NS <- Xtrain[,-arg,drop=FALSE]
#   # info records the proposed variable, BIC for the S matrix and difference
#   # in BIC for grouping versus no grouping on S, whether it was an
#   # addition step and if it was accepted
#   info <- data.frame(Var = colnames(S),
#                      BIC = BICS, BICdiff = maxdiff[arg],
#                      Step = "Add", Decision = "Accepted",
#                      Model = out[[arg]][[3]],
#                      G = out[[arg]][[4]],
#                      stringsAsFactors = FALSE)
# 
#   if(verbose)
#     { print(info[,c(1,3:5),drop=FALSE])
#       cat("iter 2\n+ adding step\n") }
# 
#   # Second Step - selecting second variable
#   out <- foreach(i = 1:ncol(NS)) %DO%
#   {
#     # cindepBIC is the BIC for the grouping model on S and the
#     # regression model of the new variable on S
#     try(fit_no_grouping <- redda_varsel(
#       X_p = NS[,i],
#       X_c = S,
#       cltrain = cltrain,
#       alpha_Xtrain = alpha_Xtrain,
#       modelNames = emModels1,
#       nsamp = nsamp,
#       model = "NG"
#     )$Best,
#     silent = TRUE)
#     
#     cindepBIC <- if(is.null(fit_no_grouping$bic)) {NA
#     } else {
#       BIC_computation(
#         X_p = NS[, i],
#         X_c = S,
#         cltrain = cltrain,
#         fitted_model = fit_no_grouping,
#         model_type = "NG"
#       )}
#     # Fit the cluster model on the two variables
#     sBIC <- NULL
#     try(fit_grouping <- redda_varsel(
#       X_p = NS[,i],
#       X_c = S,
#       cltrain = cltrain,
#       alpha_Xtrain = alpha_Xtrain,
#       modelNames = emModels2,
#       nsamp = nsamp,
#       model = "GR"
#     )$Best,
#         silent = TRUE)
#     # depBIC is the BIC for the grouping model with both variables
#     depBIC <- if (is.null(fit_grouping$bic)) {
#       NA
#     } else {
#       BIC_computation(
#         X_p = NS[, i],
#         X_c = S,
#         cltrain = cltrain,
#         fitted_model = fit_grouping,
#         model_type = "GR"
#       )
#     }
#     
#     return(list(cindepBIC, depBIC, fit_grouping$modelName, fit_grouping$G))
#   }
#   cindepBIC <- sapply(out, "[[", 1)
#   depBIC <- sapply(out, "[[", 2)
#   # cdiff is the difference between BIC for the models with variables' being
#   # grouping variables versus them being conditionally independent of
#   # the grouping
#   cdiff <- depBIC - cindepBIC
#   # Choose the variable with the largest difference
#   m <- max(cdiff[is.finite(cdiff)])
#   arg <- which(cdiff==m,arr.ind=TRUE)[1]
# 
#   # if forcetwo is true automatically add the best second variable,
#   # otherwise only add it if its difference is large than BIC.diff
#   if(forcetwo || cdiff[arg] > BIC.diff) {
#     k <- c(colnames(S), colnames(NS)[arg])
#     nks <- c(colnames(NS)[-arg])
#     BICS <- depBIC[arg]
#     info <- rbind(info, c(colnames(NS)[arg], BICS, cdiff[arg],
#                           "Add", "Accepted",
#                           out[[arg]][3:4]))
# 
#     S <- cbind(S, NS[, arg])
#     NS <- as.matrix(NS[, -arg])
#     colnames(S) <- k
#     colnames(NS) <- nks
#   } else {
#     info <- rbind(info, c(colnames(NS)[arg], BICS, cdiff[arg],
#                           "Add", "Rejected",
#                           out[[arg]][3:4]))
#   }
#   info$BIC <- as.numeric(info$BIC)
#   info$BICdiff <- as.numeric(info$BICdiff)
# 
#   if (verbose){
#     print(info[2, c(1, 3:5), drop = FALSE])
#   }
#   criterion <- 1
#   iter <- 2
#   while((criterion == 1) & (iter < itermax))
#   {
#     iter <- iter+1
#     check1 <- colnames(S)
# 
#     if(verbose) cat(paste("iter", iter, "\n"))
# 
#     # Adding step
#     if(verbose) cat("+ adding step\n")
#     # For the special case where we have removed all the grouping
#     # variables/S is empty
#     if(ncol(S)==0 || is.null(ncol(S))){
#         # We simply choose the same variable as in the first step and check
#         # whether the difference between the BIC for grouping versus not
#         # grouping is positive or not
#         m <- max(maxdiff[is.finite(maxdiff)])
#         arg <- which(maxdiff==m,arr.ind=TRUE)[1]
#         if(maxdiff[arg] > BIC.diff)
#           {
#             # if the difference is positive this variable is selected
#             # as a grouping variable
#             S <- matrix(c(Xtrain[,arg]), Ntrain, 1)
#             BICS <- maxBIC[arg]
#             colnames(S) <- colnames(Xtrain)[arg]
#             NS <- as.matrix(Xtrain[,-arg])
#             colnames(NS) <- colnames(Xtrain)[-arg]
#             info <- rbind(info, c(colnames(S), BICS, maxdiff[arg],
#                                   "Add","Accepted", NA, NA))
#           } else { # if the difference is not > BIC.diff no grouping variables exist
#             BICS <- NA
#             info <- rbind(info,
#                           c(colnames(Xtrain)[arg], BICS, maxdiff[arg],
#                             "Add", "Rejected", NA, NA))
#           }
#       } else { # Addition Step in general (for all cases except when S is empty)
#         if(ncol(NS) != 0 & !is.null(ncol(NS))) {
#             out <- foreach(i = 1:ncol(NS)) %DO%
#             {
#               # cindepBIC is the BIC for the grouping model on S and the
#               # regression model of the new variable on S
#               try(fit_no_grouping <- redda_varsel(
#                 X_p = NS[,i],
#                 X_c = S,
#                 cltrain = cltrain,
#                 alpha_Xtrain = alpha_Xtrain,
#                 modelNames = emModels2,
#                 nsamp = nsamp,
#                 model = "NG"
#               )$Best,
#               silent = TRUE)
# 
#               cindepBIC <-
#                 if (is.null(fit_no_grouping$bic)) {
#                   NA
#                 } else {
#                   BIC_computation(
#                     X_p = NS[, i],
#                     X_c = S,
#                     cltrain = cltrain,
#                     fitted_model = fit_no_grouping,
#                     model_type = "NG"
#                   )
#                 }
#               # Fit the cluster model on the two variables
#               fit_grouping <- NULL
#               try(fit_grouping <- redda_varsel(
#                 X_p = NS[,i],
#                 X_c = S,
#                 cltrain = cltrain,
#                 alpha_Xtrain = alpha_Xtrain,
#                 modelNames = emModels2,
#                 nsamp = nsamp,
#                 model = "GR"
#               )$Best,
#               silent = TRUE)
#               
#               # depBIC is the BIC for the grouping model with both variables
#               
#               depBIC <- if (is.null(fit_grouping$bic)) {
#                 NA
#               } else{
#                 BIC_computation(
#                   X_p = NS[, i],
#                   X_c = S,
#                   cltrain = cltrain,
#                   fitted_model = fit_grouping,
#                   model_type = "GR"
#                 )
#               }
#               return(list(cindepBIC, depBIC, fit_grouping$modelName, fit_grouping$G))
#             }
#             cindepBIC <- sapply(out, "[[", 1)
#             depBIC <- sapply(out, "[[", 2)
#             # cdiff is the difference between BIC for the models with
#             # variables' being grouping variables versus them being
#             # conditionally independent of the grouping
#             cdiff <- depBIC - cindepBIC
#             # Choose the variable with the largest difference
#             m <- max(cdiff[is.finite(cdiff)])
#             arg <- which(cdiff==m,arr.ind=TRUE)[1]
#             if(cdiff[arg] > BIC.diff)
#               { # if this difference is positive add this variable to S
#                 # and update the grouping model's BICS
#                 BICS <- depBIC[arg]
#                 k <- c(colnames(S),colnames(NS)[arg])
#                 nks <- c(colnames(NS)[-arg])
#                 info <- rbind(info,
#                               c(colnames(NS)[arg], BICS, cdiff[arg],
#                                 "Add", "Accepted", out[[arg]][3:4]))
#                 S <- cbind(S,NS[,arg])
#                 NS <- as.matrix(NS[,-arg])
#                 colnames(S) <- k
#                 colnames(NS) <- nks
#               } else {
#                 info <- rbind(info,
#                               c(colnames(NS)[arg], depBIC[arg], cdiff[arg],
#                                 "Add", "Rejected", out[[arg]][3:4]))
#               }
#           }
#       }
#     if(verbose) cat("- removing step\n")
#     # Removal Step for the special case where S contains only a single variable
#     if(ncol(S) == 1){
#         cdiff <- 0
#         oneBIC <- NA
#         if (alpha_Xtrain != 0) {
#           best_subset <-
#             MASS::cov.mcd(x = S, quantile.used = floor((1 - alpha_Xtrain) * Ntrain))$best
#           # I recover the best subset on which the mean and the variance are computed.
#           # Afterwards, I perform a robust single component no-grouping normal model using Mclust,
#           # this is done because cov.mcd (as well as covMcd in robustbase)
#           # use consistency correction that I do not want to include
# 
#           try(fit_no_grouping <-
#                 Mclust(
#                   S[best_subset,,drop=F],
#                   G = 1,
#                   modelNames = emModels1,
#                   verbose = FALSE
#                 ),
#               silent = TRUE)
#           
#           # BIC for the entire set of obs having robustly estimated model parameters
#           oneBIC <- 2*sum(
#             dnorm(
#               x = Xtrain[, i],
#               mean = fit_no_grouping$parameters$mean,
#               sd = sqrt(fit_no_grouping$parameters$variance$sigmasq),
#               log = TRUE
#             )
#           ) - 2*log(length(Xtrain[, i]))
#           
#         } else {
#           try(oneBIC <- Mclust(
#             as.matrix(S),
#             G = 1,
#             modelNames = emModels1,
#             verbose = FALSE
#           )$BIC[1],
#           silent = TRUE)
#         }
#         # Difference between maximum BIC for grouping and BIC
#         # for no grouping
#         cdiff <- c(BICS - oneBIC)
#         if(is.na(cdiff)) cdiff <- 0
#         # Check if difference is less than BIC.diff
#         if(cdiff <= BIC.diff){ # if negative remove the variable from S and set the BIC
#             # for the model to NA
#             BICS <- NA
#             info <- rbind(info, c(colnames(S), BICS, cdiff,
#                           "Remove", "Accepted", NA, NA))
#             k <- c(colnames(NS),colnames(S))
#             NS <- cbind(NS,S)
#             S <- NULL
#             colnames(NS) <- k
#           } else { # Otherwise leave S and BICS alone
#             info <- rbind(info, c(colnames(S),
#                           info[nrow(info),2], cdiff,
#                           "Remove", "Rejected",
#                           unlist(info[nrow(info),c(6,7)])))
#           }
#       } else { # Removal step in general (for all cases except when S is a single
#         # variable or empty)
#         if(ncol(S) >= 2) {
#             # Check if the data is at least 3 dimensional
#           name <- if(ncol(S) > 2) emModels2 else emModels1
#             out <- foreach(i = 1:ncol(S)) %DO%
#             {
#               try(fit_no_grouping <- redda_varsel(
#                 X_p = S[,i],
#                 X_c = S[,-i, drop=FALSE],
#                 cltrain = cltrain,
#                 alpha_Xtrain = alpha_Xtrain,
#                 modelNames = name,
#                 nsamp = nsamp,
#                 model = "NG"
#               )$Best,
#               silent = TRUE)
#               # cindepBIC is the BIC for the clustering model on the other
#               # variables in S and the regression model of the proposed
#               #  variable on the other variables in S
#               cindepBIC <- if (is.null(fit_no_grouping$bic)) {
#                 NA
#               } else{
#                 BIC_computation(X_p = S[,i],
#                                 X_c = S[,-i, drop=FALSE],
#                                 cltrain = cltrain, fitted_model = fit_no_grouping, model_type = "NG")
#               }
#               # rdep is the the BIC for the clustering model on the other
#               # variables in S (excluding the proposed variable)
#               rdep <- if (is.null(fit_no_grouping$bic_noreg)) {
#                 NA
#               } else{
#                 BIC_computation(X_p = S[,i],
#                                 X_c = S[,-i, drop=FALSE],
#                                 cltrain = cltrain, fitted_model = fit_no_grouping, model_type = "only_mixture")
#               }
#               return(list(cindepBIC, rdep, fit_no_grouping$modelName, fit_no_grouping$G))
#             }
#             cindepBIC <- sapply(out, "[[", 1)
#             rdep <- sapply(out, "[[", 2)
#             # depBIC is the BIC for the grouping model with all
#             # variables in S
#             depBIC <- BICS
#             # cdiff is the difference between BIC for the models with
#             # variables' being grouping variables versus them being
#             # conditionally independent of the grouping
#             cdiff <- depBIC - cindepBIC
#             # Choose the variable with the smallest difference
#             m <- min(cdiff[is.finite(cdiff)])
#             arg <- which(cdiff==m,arr.ind=TRUE)[1]
#             if(cdiff[arg] <= BIC.diff)
#               { # if this difference is less than BIC.diff remove this
#                 # variable from S and update the grouping model's BICS
#                 BICS <- rdep[arg]
#                 k <- c(colnames(NS),colnames(S)[arg])
#                 nks <- c(colnames(S)[-arg])
#                 info <- rbind(info,
#                               c(colnames(S)[arg], BICS, cdiff[arg],
#                                 "Remove", "Accepted", out[[arg]][3:4]))
#                 NS <- cbind(NS,S[,arg])
#                 S <- as.matrix(S[,-arg])
#                 colnames(S) <- nks
#                 colnames(NS) <- k
#               } else { info <- rbind(info,
#                            c(colnames(S)[arg], rdep[arg], cdiff[arg],
#                              "Remove", "Rejected", out[[arg]][3:4]))
#               }
#           }
#       }
#     info$BIC <- as.numeric(info$BIC)
#     info$BICdiff <- as.numeric(info$BICdiff)
# 
#     if(verbose)
#       print(info[seq(nrow(info)-1,nrow(info)),c(1,3:5),drop=FALSE])
#     # Check if the variables in S have changed or not
#     check2 <- colnames(S)
#     if(is.null(check2)) # all variables have been removed
#       { criterion <- 0 }
#     else
#       # if they have changed (either added one or removed one or changed one)
#       # then continue the algorithm (criterion is 1) otherwise stop
#       # (criterion is 0)
#       { if(length(check2) != length(check1))
#           { criterion <- 1 }
#         else
#           { criterion <- if(sum(check1==check2) != length(check1)) 1 else 0 }
#       }
#   }
# 
#   if(iter >= itermax)
#     warning("Algorithm stopped because maximum number of iterations was reached")
# 
#   # List the selected variables and the matrix of steps' information
#   info$BIC <- as.numeric(info$BIC)
#   info$BICdiff <- as.numeric(info$BICdiff)
#   # reorder steps.info
#   info <- info[,c(1,4,2,6,7,3,5),drop=FALSE]
#   colnames(info) <- c("Variable proposed", "Type of step",
#                       "BICclust", "Model", "G", "BICdiff", "Decision")
#   varnames <- colnames(Xtrain)
#   subset <- if(is.null(S)) NULL
#             else sapply(colnames(S), function(x) which(x == varnames))
# 
#   out <- list(variables = varnames,
#               subset = subset,
#               steps.info = info,
#               search = "greedy",
#               direction = "forward")
# 
#   class(out) <- "robust_varsel"
#   return(out)
# }