# Function used in random initialization ---------------------------------
patterned_MCD_varsel <-
# Robust random initialization with variable selection.
# Repeats `nsamp` independent random starts of a patterned-MCD-type trimmed
# estimator and returns the start with the highest (trimmed) log-likelihood.
# Meant to be run only when alpha_Xtrain != 0 (i.e. some trimming is requested).
#
# Arguments:
#   nsamp        - number of random initializations to try
#   Xtrain       - training data matrix (n x p) of the currently retained variables
#   X_p          - extra candidate variable; used only when model == "NG", via a
#                  regression of X_p on Xtrain
#   cltrain      - class labels of the training units
#   ltrain       - 0/1 class-indicator matrix (n x G), passed as z to mclust::mstep
#   alpha_Xtrain - trimming proportion: ceiling(n * alpha_Xtrain) obs are discarded
#   modelName    - mclust covariance model name
#   model        - "NG" (no grouping: add the X_p regression term) or a grouping model
#
# Returns: list(ll = <trimmed log-likelihood>, fitm = <mclust mstep output,
# possibly augmented with lm_fit / ll_reg / X>) for the best random start.
function(nsamp,
Xtrain,
X_p,
cltrain,
ltrain,
alpha_Xtrain,
modelName, model) {
# Single random start: draw a random (p + 1)-subset per class, then iterate
# (trim lowest-density obs -> refit) until the trimmed set stabilizes.
robust_start_varsel <-
function(Xtrain,
X_p,
cltrain,
ltrain,
alpha_Xtrain,
modelName, model) {
Ntrain <- nrow(Xtrain)
# Random (p + 1)-subset per class. If some class has fewer than p + 1 units,
# sample(replace = FALSE) errors and we fall back to using all observations.
J_ind <-
tryCatch(
c(sapply(
split(seq(cltrain), cltrain),
sample,
size = ncol(Xtrain) + 1,
replace = FALSE
)),
error = function(e)
seq(cltrain)
)
# Init with the random (p + 1)-subset J_g: an M-step on the subsample gives the
# starting parameters; on failure, parameters = NA makes the E-step below fail
# gracefully (caught by its own tryCatch).
fitm <- tryCatch(
mclust::mstep(
modelName = modelName,
data = Xtrain[J_ind,,drop=F],
z = ltrain[J_ind,,drop=F]
),
error = function(e) {
list(parameters = NA)
}
)
if (model == "NG") {
# NG model: also fit the regression of X_p on Xtrain on the same subsample.
# fit_reg is a project helper; presumably lm-like -- it is used below via
# $coefficients and residuals() -- TODO confirm against its definition.
lm_fit <- fit_reg(x=Xtrain[J_ind,,drop=F], y=X_p[J_ind,,drop=F])
}
emptyz <- TRUE
iter_max_patterned_MCD <- 50 # safety cap: for small numbers of variables the loop below can cycle forever (two obs alternately trimmed); it would be nice to expose this as an input option in the future
iter_patterned_MCD <- 0
pos_old <-
1:ceiling(Ntrain * alpha_Xtrain) # used in the while loop
# initial placeholder value: just the first ceiling(Ntrain * alpha_Xtrain) obs;
# the estimation keeps running until two consecutive iterations trim exactly
# the same obs (or the iteration cap is reached)
criterion <- TRUE
while (criterion) {
iter_patterned_MCD <- iter_patterned_MCD + 1
# Robust step: discard the obs with the smallest value of the component
# density given their label
fitetrain <-
tryCatch(
do.call(mclust::estep, c(list(data = Xtrain), fitm)),
error = function(e) {
list(z = NA)
}
)
Xtrain_fit <- NULL # initialized here to prevent errors below when the EM algorithm fails
emptyz <- TRUE
if (all(!is.na(fitetrain$z))) {
emptyz <- FALSE
D_Xtrain_cond <-
do.call(mclust::cdens, c(list(data = Xtrain, # component log-densities for every obs/class pair: computed on ALL obs because we also want to trim obs that might have
logarithm = T), fitm)) # been WRONGLY assigned to a class
ind_D_Xtrain_cdens <-
cbind(1:Ntrain, mclust::map(ltrain)) # matrix of (obs index, group from ltrain) pairs
D_Xtrain <-
D_Xtrain_cond[ind_D_Xtrain_cdens] # if model == "GR" then Xtrain = cbind(X_c, X_p), so D_Xtrain is D_grouping
if (model == "NG") {
# NG model: total log-density = component log-density + regression
# log-density of X_p given Xtrain
alpha <- lm_fit$coefficients[1]
beta <- lm_fit$coefficients[-1]
sigma <- sqrt(mean(residuals(lm_fit) ^ 2)) # ML (not bias-corrected) residual std. dev.
D_reg <-
as.vector(dnorm(
X_p,
mean = alpha + Xtrain[, names(beta), drop = FALSE] %*% beta,
sd = sigma,
log = T
))
D_Xtrain <- D_Xtrain + D_reg #D_{No Grouping}
}
# Trim the ceiling(Ntrain * alpha_Xtrain) obs with lowest log-density
pos_trimmed_train <-
which(D_Xtrain <= (sort(D_Xtrain, decreasing = F)
[[ceiling(Ntrain * alpha_Xtrain)]]))
if (length(pos_trimmed_train) != ceiling(Ntrain * alpha_Xtrain)) {
# ties guard: for some models (usually the simplest ones) two obs can have
# exactly the same component density, so which() returns too many positions
pos_trimmed_train <-
pos_trimmed_train[1:ceiling(Ntrain * alpha_Xtrain)]
}
ltrain_fit <- ltrain[-pos_trimmed_train, , drop = F]
Xtrain_fit <- Xtrain[-pos_trimmed_train, , drop = F]
# Refit the model on the untrimmed observations
fitm <-
tryCatch(
mclust::mstep(
modelName = modelName,
data = Xtrain_fit,
z = ltrain_fit
),
error = function(e) {
list(parameters = NA)
}
)
if (model == "NG") {
# Refit the X_p regression on the untrimmed observations as well
X_p_fit <- X_p[-pos_trimmed_train, , drop =F]
lm_fit <-
fit_reg(x = Xtrain_fit, y = X_p_fit)
}
# Stop when the same obs are trimmed twice in a row, or on iteration cap
if (all(pos_old == pos_trimmed_train) | iter_patterned_MCD >= iter_max_patterned_MCD) {
criterion <- FALSE
} else {
pos_old <- pos_trimmed_train
}
}
else {
# E-step failed (degenerate parameters): abandon this random start
criterion <- FALSE
}
}
if (!emptyz) {
# Trimmed log-likelihood: for each untrimmed obs, log(pro_g) + log component
# density of its labelled class, selected by the 0/1 indicators ltrain_fit
mTau.train <-
matrix(log(fitm$parameters$pro),
nrow(Xtrain_fit),
fitm$G,
byrow = TRUE)
lDensity.train <- do.call(mclust::cdens, c(list(
data = Xtrain_fit,
logarithm = TRUE
), fitm))
sum.train <- mTau.train + lDensity.train
mat.train <- ltrain_fit * sum.train
ll <- sum(mat.train)
if (model == "NG") {
# Add the regression log-likelihood of X_p given Xtrain (untrimmed obs only)
alpha <- lm_fit$coefficients[1]
beta <- lm_fit$coefficients[-1]
sigma <- sqrt(mean(residuals(lm_fit) ^ 2))
ll_reg <-
sum(dnorm(
X_p_fit,
mean = alpha + Xtrain_fit[, names(beta), drop = FALSE] %*% beta,
sd = sigma,
log = T
))
ll <- ll + ll_reg
# Adding the \hat{\boldsymbol{\beta}}^{(0)} and \hat{\sigma}^{(0)} estimates
# to the returned object
fitm$lm_fit <- lm_fit
fitm$ll_reg <- ll_reg
}
} else {
ll <- NA # this random start failed entirely (E-step never succeeded)
}
fitm$X <- Xtrain_fit # used in the initialization of raedda_transductive_model: for VVE and EVE models, when the constraint is not satisfied, the MM algorithm needs the data the M-step was computed from
return(list(ll = ll, fitm = fitm))
}
# Run nsamp independent random starts. With simplify = T, replicate() returns a
# 2 x nsamp matrix: row 1 holds the ll values, row 2 the corresponding fits.
n_init <-
replicate(
nsamp,
expr = robust_start_varsel(Xtrain,
X_p,
cltrain,
ltrain,
alpha_Xtrain,
modelName, model),
simplify = T
)
if(!all(is.na(n_init[1,]))){
# Keep the random start with the highest trimmed log-likelihood
ind <- which.max(n_init[1, ])
} else {
ind <- 1 # if no initial subsample worked, return the first one; the algorithm will then break down further along in the code
}
return(n_init[, ind])
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.