#############################################################################
## Sequential & parallel robust forward greedy search REDDA
#############################################################################
redda_varsel_greedy_forward <- function(Xtrain,
cltrain,
emModels1 = c("E", "V"),
emModels2 = mclust.options("emModelNames"),
alpha_Xtrain,
forcetwo = TRUE,
BIC.diff = 0,
itermax = 100,
nsamp = 100,
parallel = FALSE,
verbose = interactive())
{
Xtrain <- data.matrix(Xtrain)
Ntrain <- nrow(Xtrain) # number of rows = number of observations
Ntrain_trim <- if(alpha_Xtrain!=0) {Ntrain - ceiling(Ntrain * alpha_Xtrain)} else {NULL}
d <- ncol(Xtrain) # number of columns = number of variables
if(is.null(colnames(Xtrain))|any(duplicated(colnames(Xtrain)))){
colnames(Xtrain) <- paste("X", 1:d, sep = "")
}
G <- length(unique(cltrain))
# Start "parallel backend" if needed
if(is.logical(parallel)) {
if (parallel) {
parallel <- clustvarsel::startParallel(parallel)
stopCluster <-
TRUE
} else {
parallel <- stopCluster <- FALSE
}
} else {
stopCluster <- if (inherits(parallel, "cluster"))
FALSE
else
TRUE
parallel <- clustvarsel::startParallel(parallel)
}
on.exit(if (parallel & stopCluster)
parallel::stopCluster(attr(parallel, "cluster")))
# define operator to use depending on parallel being TRUE or FALSE
`%DO%` <- if(parallel) `%dopar%` else `%do%`
i <- NULL # dummy to trick R CMD check
# First Step - selecting single variable
if(verbose) cat(paste("iter 1\n+ adding step\n"))
out <- foreach(i = 1:d) %DO% {
xBIC <- NULL
# Fit the single variable grouping models
try(xBIC <- redda_varsel(
X_p = NULL,
X_c = Xtrain[, i],
cltrain = cltrain,
alpha_Xtrain = alpha_Xtrain,
modelNames = emModels1,
nsamp = nsamp,
model = "GR"
)$Best,
silent = TRUE)
# maxBIC is the maximum BIC over all grouping models fit
maxBIC <- if(is.null(xBIC$bic)) NA else xBIC$bic
# Fit and get BIC for a single component no-group normal model
if (alpha_Xtrain != 0) {
best_subset <-
MASS::cov.mcd(x = Xtrain[, i], quantile.used = floor((1 - alpha_Xtrain) * Ntrain))$best
# I recover the best subset on which the mean and the variance are computed.
# Afterwards, I perform a robust single component no-grouping normal model using Mclust,
# this is done because cov.mcd (as well as covMcd in robustbase)
# use consistency correction that I do not want to include
try(oneBIC <-
mclust::Mclust(
Xtrain[best_subset, i],
G = 1,
modelNames = emModels1,
verbose = FALSE
)$BIC[1],
silent = TRUE)
} else {
try(oneBIC <- mclust::Mclust(
Xtrain[, i],
G = 1,
modelNames = emModels1,
verbose = FALSE
)$BIC[1],
silent = TRUE)
}
return(list(maxBIC, oneBIC, xBIC$modelName, xBIC$G))
}
maxBIC <- sapply(out, "[[", 1)
oneBIC <- sapply(out, "[[", 2)
# Difference between maximum BIC for grouping and BIC for no grouping
maxdiff <- maxBIC - oneBIC
# Find the single variable with the biggest difference between
# grouping and no grouping
m <- max(maxdiff[is.finite(maxdiff)])
arg <- which(maxdiff==m,arr.ind=TRUE)[1]
# This is our first selected variable/S is the matrix of currently selected
# grouping variables
S <- Xtrain[,arg,drop=FALSE]
# BICS is the BIC value for the grouping model with the variable(s) in S
BICS <- maxBIC[arg]
# NS is the matrix of currently not selected variables
NS <- Xtrain[,-arg,drop=FALSE]
# info records the proposed variable, BIC for the S matrix and difference
# in BIC for grouping versus no grouping on S, whether it was an
# addition step and if it was accepted
info <- data.frame(Var = colnames(S),
BIC = BICS, BICdiff = maxdiff[arg],
Step = "Add", Decision = "Accepted",
Model = out[[arg]][[3]],
G = out[[arg]][[4]],
stringsAsFactors = FALSE)
if(verbose)
{ print(info[,c(1,3:5),drop=FALSE])
cat("iter 2\n+ adding step\n") }
# Second Step - selecting second variable
out <- foreach(i = 1:ncol(NS)) %DO%
{
# cindepBIC is the BIC for the grouping model on S and the
# regression model of the new variable on S
try(cindepBIC <- redda_varsel(
X_p = NS[,i],
X_c = S,
cltrain = cltrain,
alpha_Xtrain = alpha_Xtrain,
modelNames = emModels1,
nsamp = nsamp,
model = "NG"
)$Best,
silent = TRUE)
cindepBIC <- if(is.null(cindepBIC$bic)) NA else cindepBIC$bic
# Fit the cluster model on the two variables
sBIC <- NULL
try(sBIC <- redda_varsel(
X_p = NS[,i],
X_c = S,
cltrain = cltrain,
alpha_Xtrain = alpha_Xtrain,
modelNames = emModels2,
nsamp = nsamp,
model = "GR"
)$Best,
silent = TRUE)
# depBIC is the BIC for the grouping model with both variables
depBIC <- if(is.null(sBIC$bic)) NA else sBIC$bic
return(list(cindepBIC, depBIC, sBIC$modelName, sBIC$G))
}
cindepBIC <- sapply(out, "[[", 1)
depBIC <- sapply(out, "[[", 2)
# cdiff is the difference between BIC for the models with variables' being
# grouping variables versus them being conditionally independent of
# the grouping
cdiff <- depBIC - cindepBIC
# Choose the variable with the largest difference
m <- max(cdiff[is.finite(cdiff)])
arg <- which(cdiff==m,arr.ind=TRUE)[1]
# if forcetwo is true automatically add the best second variable,
# otherwise only add it if its difference is large than BIC.diff
if(forcetwo || cdiff[arg] > BIC.diff) {
k <- c(colnames(S), colnames(NS)[arg])
nks <- c(colnames(NS)[-arg])
BICS <- depBIC[arg]
info <- rbind(info, c(colnames(NS)[arg], BICS, cdiff[arg],
"Add", "Accepted",
out[[arg]][3:4]))
S <- cbind(S, NS[, arg])
NS <- as.matrix(NS[, -arg])
colnames(S) <- k
colnames(NS) <- nks
} else {
info <- rbind(info, c(colnames(NS)[arg], BICS, cdiff[arg],
"Add", "Rejected",
out[[arg]][3:4]))
}
info$BIC <- as.numeric(info$BIC)
info$BICdiff <- as.numeric(info$BICdiff)
if (verbose){
print(info[2, c(1, 3:5), drop = FALSE])
}
criterion <- 1
iter <- 2
while((criterion == 1) & (iter < itermax))
{
iter <- iter+1
check1 <- colnames(S)
if(verbose) cat(paste("iter", iter, "\n"))
# Adding step
if(verbose) cat("+ adding step\n")
# For the special case where we have removed all the grouping
# variables/S is empty
if(ncol(S)==0 || is.null(ncol(S))){
# We simply choose the same variable as in the first step and check
# whether the difference between the BIC for grouping versus not
# grouping is positive or not
m <- max(maxdiff[is.finite(maxdiff)])
arg <- which(maxdiff==m,arr.ind=TRUE)[1]
if(maxdiff[arg] > BIC.diff)
{
# if the difference is positive this variable is selected
# as a grouping variable
S <- matrix(c(Xtrain[,arg]), Ntrain, 1)
BICS <- maxBIC[arg]
colnames(S) <- colnames(Xtrain)[arg]
NS <- as.matrix(Xtrain[,-arg])
colnames(NS) <- colnames(Xtrain)[-arg]
info <- rbind(info, c(colnames(S), BICS, maxdiff[arg],
"Add","Accepted", NA, NA))
} else { # if the difference is not > BIC.diff no grouping variables exist
BICS <- NA
info <- rbind(info,
c(colnames(Xtrain)[arg], BICS, maxdiff[arg],
"Add", "Rejected", NA, NA))
}
} else { # Addition Step in general (for all cases except when S is empty)
if(ncol(NS) != 0 & !is.null(ncol(NS))) {
if(ncol(S)==1) {
modelNames_NG <- emModels1 # if S is one-dimensional, univariate models need to be fitted in the No-Grouping structure
} else{
modelNames_NG <- emModels2
}
out <- foreach(i = 1:ncol(NS)) %DO%
{
# cindepBIC is the BIC for the grouping model on S and the
# regression model of the new variable on S
cindepBIC <- NULL
try(cindepBIC <- redda_varsel(
X_p = NS[,i],
X_c = S,
cltrain = cltrain,
alpha_Xtrain = alpha_Xtrain,
modelNames = modelNames_NG,
nsamp = nsamp,
model = "NG"
)$Best,
silent = TRUE)
cindepBIC <- if(is.null(cindepBIC$bic)) NA else cindepBIC$bic
# Fit the cluster model on the two variables
sBIC <- NULL
try(sBIC <- redda_varsel(
X_p = NS[,i],
X_c = S,
cltrain = cltrain,
alpha_Xtrain = alpha_Xtrain,
modelNames = emModels2,
nsamp = nsamp,
model = "GR"
)$Best,
silent = TRUE)
# depBIC is the BIC for the grouping model with both variables
depBIC <- if(is.null(sBIC$bic)) NA else sBIC$bic
return(list(cindepBIC, depBIC, sBIC$modelName, sBIC$G))
}
cindepBIC <- sapply(out, "[[", 1)
depBIC <- sapply(out, "[[", 2)
# cdiff is the difference between BIC for the models with
# variables' being grouping variables versus them being
# conditionally independent of the grouping
cdiff <- depBIC - cindepBIC
# Choose the variable with the largest difference
m <- max(cdiff[is.finite(cdiff)])
arg <- which(cdiff==m,arr.ind=TRUE)[1]
if(cdiff[arg] > BIC.diff)
{ # if this difference is positive add this variable to S
# and update the grouping model's BICS
BICS <- depBIC[arg]
k <- c(colnames(S),colnames(NS)[arg])
nks <- c(colnames(NS)[-arg])
info <- rbind(info,
c(colnames(NS)[arg], BICS, cdiff[arg],
"Add", "Accepted", out[[arg]][3:4]))
S <- cbind(S,NS[,arg])
NS <- as.matrix(NS[,-arg])
colnames(S) <- k
colnames(NS) <- nks
} else {
info <- rbind(info,
c(colnames(NS)[arg], depBIC[arg], cdiff[arg],
"Add", "Rejected", out[[arg]][3:4]))
}
}
}
if(verbose) cat("- removing step\n")
# Removal Step for the special case where S contains only a single variable
if(ncol(S) == 1){
cdiff <- 0
oneBIC <- NA
if (alpha_Xtrain != 0) {
best_subset <-
MASS::cov.mcd(x = S, quantile.used = floor((1 - alpha_Xtrain) * Ntrain))$best
# I recover the best subset on which the mean and the variance are computed.
# Afterwards, I perform a robust single component no-grouping normal model using Mclust,
# this is done because cov.mcd (as well as covMcd in robustbase)
# use consistency correction that I do not want to include
try(oneBIC <-
Mclust(
S[best_subset,,drop=F],
G = 1,
modelNames = emModels1,
verbose = FALSE
)$BIC[1],
silent = TRUE)
} else {
try(oneBIC <- Mclust(
as.matrix(S),
G = 1,
modelNames = emModels1,
verbose = FALSE
)$BIC[1],
silent = TRUE)
}
# Difference between maximum BIC for grouping and BIC
# for no grouping
cdiff <- c(BICS - oneBIC)
if(is.na(cdiff)) cdiff <- 0
# Check if difference is less than BIC.diff
if(cdiff <= BIC.diff){ # if negative remove the variable from S and set the BIC
# for the model to NA
BICS <- NA
info <- rbind(info, c(colnames(S), BICS, cdiff,
"Remove", "Accepted", NA, NA))
k <- c(colnames(NS),colnames(S))
NS <- cbind(NS,S)
S <- NULL
colnames(NS) <- k
} else { # Otherwise leave S and BICS alone
info <- rbind(info, c(colnames(S),
info[nrow(info),2], cdiff,
"Remove", "Rejected",
unlist(info[nrow(info),c(6,7)])))
}
} else { # Removal step in general (for all cases except when S is a single
# variable or empty)
if(ncol(S) >= 2) {
# Check if the data is at least 3 dimensional
name <- if(ncol(S) > 2) emModels2 else emModels1
out <- foreach(i = 1:ncol(S)) %DO%
{
try(sBIC <- redda_varsel(
X_p = S[,i],
X_c = S[,-i, drop=FALSE],
cltrain = cltrain,
alpha_Xtrain = alpha_Xtrain,
modelNames = name,
nsamp = nsamp,
model = "NG"
)$Best,
silent = TRUE)
# cindepBIC is the BIC for the clustering model on the other
# variables in S and the regression model of the proposed
# variable on the other variables in S
cindepBIC <- if(is.null(sBIC$bic)) NA else sBIC$bic
# rdep is the the BIC for the clustering model on the other
# variables in S (excluding the proposed variable)
rdep <- if(is.null(sBIC$bic_noreg)) NA else sBIC$bic_noreg
return(list(cindepBIC, rdep, sBIC$modelName, sBIC$G))
}
cindepBIC <- sapply(out, "[[", 1)
rdep <- sapply(out, "[[", 2)
# depBIC is the BIC for the grouping model with all
# variables in S
depBIC <- BICS
# cdiff is the difference between BIC for the models with
# variables' being grouping variables versus them being
# conditionally independent of the grouping
cdiff <- depBIC - cindepBIC
# Choose the variable with the smallest difference
m <- min(cdiff[is.finite(cdiff)])
arg <- which(cdiff==m,arr.ind=TRUE)[1]
if(cdiff[arg] <= BIC.diff)
{ # if this difference is less than BIC.diff remove this
# variable from S and update the grouping model's BICS
BICS <- rdep[arg]
k <- c(colnames(NS),colnames(S)[arg])
nks <- c(colnames(S)[-arg])
info <- rbind(info,
c(colnames(S)[arg], BICS, cdiff[arg],
"Remove", "Accepted", out[[arg]][3:4]))
NS <- cbind(NS,S[,arg])
S <- as.matrix(S[,-arg])
colnames(S) <- nks
colnames(NS) <- k
} else { info <- rbind(info,
c(colnames(S)[arg], rdep[arg], cdiff[arg],
"Remove", "Rejected", out[[arg]][3:4]))
}
}
}
info$BIC <- as.numeric(info$BIC)
info$BICdiff <- as.numeric(info$BICdiff)
if(verbose)
print(info[seq(nrow(info)-1,nrow(info)),c(1,3:5),drop=FALSE])
# Check if the variables in S have changed or not
check2 <- colnames(S)
if(is.null(check2)) # all variables have been removed
{ criterion <- 0 }
else
# if they have changed (either added one or removed one or changed one)
# then continue the algorithm (criterion is 1) otherwise stop
# (criterion is 0)
{ if(length(check2) != length(check1))
{ criterion <- 1 }
else
{ criterion <- if(sum(check1==check2) != length(check1)) 1 else 0 }
}
}
if(iter >= itermax)
warning("Algorithm stopped because maximum number of iterations was reached")
# List the selected variables and the matrix of steps' information
info$BIC <- as.numeric(info$BIC)
info$BICdiff <- as.numeric(info$BICdiff)
# reorder steps.info
info <- info[,c(1,4,2,6,7,3,5),drop=FALSE]
colnames(info) <- c("Variable proposed", "Type of step",
"BICclust", "Model", "G", "BICdiff", "Decision")
varnames <- colnames(Xtrain)
subset <- if(is.null(S)) NULL
else sapply(colnames(S), function(x) which(x == varnames))
out <- list(variables = varnames,
subset = subset,
steps.info = info,
search = "greedy",
direction = "forward")
class(out) <- "robvarsel"
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.