#' CrossValidate FCnet model
#'
#' This function is a wrapper around `glmnetUtils::cva.glmnet()`, which is
#' itself built around `glmnet::cv.glmnet()`. For extended documentation,
#' the readers are encouraged to consult the respective pages.
#' `cv_FCnet()` requires two objects at minimum: `y` is a vector or data.frame
#' with exactly one column, corresponding to the (behavioral) score to predict; `x`
#' is a data.frame or a list of lists with an entry named "Weights",
#' which includes the independent variables. `x` can be - and is meant to be -
#' one object created by `reduce_featuresFC()`, but this is not strictly necessary.
#' The crossvalidated lambda and alpha parameters are returned. For the model,
#' use the `FCnetLOO()` function to avoid overfitting.
#' Differently from `glmnet::cv.glmnet()`, here the
#' mean absolute error is minimized by default; you can change this parameter
#' directly in the function or by running `optionsFCnet(optionsFCnet= "mse")`.
#'
#' @param y The dependent variable, typically behavioral scores to predict.
#' This can be a vector or a single data.frame column.
#' @param x The independent variables, typically neural measures that have
#' been already summarised through data reduction techniques
#' (e.g. ICA, PCA): an object created by `reduce_featuresFC()` will do. If such
#' an object is passed to this function, the "Weights" slot is taken as x.
#' A list can be passed to this function: in this case the function needs an
#' entry named "Weights". Otherwise, a data.frame can be passed to x.
#' @param alpha Value(s) that bias the elastic net toward ridge regression
#' (alpha== 0) or LASSO regression (alpha== 1). If a vector of alpha values
#' is supplied, the value is optimized through crossvalidation.
#' It defaults to a vector ranging from 0 to 1 with steps of 0.1.
#' The crossvalidated alpha is returned.
#' @param lambda Regularization parameter for the regression,
#' see `glmnet::glmnet()`. Lambda must be a vector with length>1.
#' When a vector of lambda values is supplied, the value of lambda
#' is optimized through internal crossvalidation. It defaults to a vector
#' ranging from 10^-5 to 10^5 with 200 values in logarithmic steps.
#' The crossvalidated optimal lambda is returned.
#' @param parallelLOO If TRUE - recommended, but not the default - uses
#' `future.apply::future_lapply()` for the outer loops: `future.apply` must be
#' installed, the machine should have multiple cores available for use,
#' and threads should be defined explicitly beforehand by the user
#' (e.g. by calling `plan(multisession)`).
#' @param cv_Ncomp Whether to crossvalidate the number of components or not.
#' It defaults to NULL, but a vector can be supplied specifing the number (range) of
#' components to test in the inner loops.
#' @param cv_Ncomp_method Whether the number of components to optimize means
#' components are ordered (e.g. according to the explained variance of neuroimaging
#' data) or - somehow experimental - whether to use the N best components
#' ranked according to their relationship (in the context of a
#' univariate (g)lm) with y.
#' @param family Defaults to "gaussian." Experimental support for "binomial" on the way.
#' @param cv.type.measure The measure to minimize in crossvalidation inner loops.
#' Differently from `glmnetUtils::cva.glmnet()` the deafult is the mean absolute error.
#' @param intercept whether to fit (TRUE) or not (FALSE) an intercept to the model.
#' @param standardize Whether x must be standardized internally to glmnet.
#' @param thresh Threshold for glmnet to stop converging to the solution.
#' @param ... Other parameters passed to `glmnetUtils::cva.glmnet()`.
#'
#' @return The crossvalidated alpha and lambda parameters with the associated error.
#' @export
cv_FCnet= function(y, #dependent variable, typically behavior
x, #independent variables, typically neural measures
alpha= seq(0, 1, by= 0.1),
lambda= rev(10^seq(-5, 5, length.out = 200)),
parallelLOO= F,
cv_Ncomp= NULL,
cv_Ncomp_method= c("order", "R"),
family= optionsFCnet("family"),
type.measure= optionsFCnet("cv.type.measure"),
intercept= optionsFCnet("intercept"),
standardize= optionsFCnet("standardize"),
thresh= optionsFCnet("thresh"),
...){
cv_Ncomp_method= match.arg(cv_Ncomp_method)
if(length(lambda)==1){
stop("Scalar values for lambda have been supplied: a vector is needed.")
}
#ensure you are working with matrices
y= data.matrix(y)
if(class(x)[1]== "list"){
x= data.matrix(x$Weights)
} else {
if(class(x)[1]== "data.frame"){x= data.matrix(x)}
}
#nfold is set to loo
nfolds= nrow(x)
#initialize a vector of zeros to store coefficients
#this is in case cv of the number of components is required
cname= c(("Intercept"), colnames(x))
zeros= rep(0, ncol(x) + 1) #+1 is the intercept
#if cv_Ncomp is null, all components are tested
if(is.null(cv_Ncomp))(cv_Ncomp= ncol(x))
if(cv_Ncomp_method== "order" | length(cv_Ncomp)== 1){
test_c= lapply(cv_Ncomp, function(x)1:x)
}
if(cv_Ncomp_method== "R") {
if(family== "gaussian"){
#r= unlist(apply(x, 2, function(z){coef(lm(y~z))[2]}))
r= unlist(apply(x, 2, function(z){deviance(lm(y~z))}))
r= r*(-1)
} else {
#r= unlist(apply(x, 2, function(z){coef(glm(y~z, family= "binomial"))[2]}))
r= unlist(apply(x, 2, function(z){deviance(glm(y~z, family= "binomial"))}))
r= r*(-1)
}
rank= rank(-r, ties.method = "max")
all_x= 1:ncol(x)
test_c= lapply(cv_Ncomp, function(n){
all_x[rank<= n]
})
}
#to add here: what to do with missing values?
#crossvalidate alpha and/or lambda, but only if vectors are supplied
if (length(alpha)>1 | length(lambda)>1 | length(cv_Ncomp)>1){
if(parallelLOO== T){
Ncomp_ridge= future.apply::future_lapply(test_c, function(NC){
cva= cva.glmnet(x= x[, NC], y= y,
alpha = alpha,
lambda = lambda,
nfolds= nfolds,
grouped= ifelse(nfolds== nrow(x), F, T),
family= family,
type.measure = type.measure,
intercept= intercept,
standardize= standardize,
thresh= thresh,
keep= T,
...
)
return(cva)
}, future.seed= T)
} else {
Ncomp_ridge= lapply(test_c, function(NC){
cva= cva.glmnet(x= x[, NC], y= y,
alpha = alpha,
lambda = lambda,
nfolds= nfolds,
grouped= ifelse(nfolds== nrow(x), F, T),
family= family,
type.measure = type.measure,
intercept= intercept,
standardize= standardize,
thresh= thresh,
keep= T,
...
)
return(cva)
})
}
#collect parameters
pars= sapply(Ncomp_ridge, function(p)get_CVparsFCnet(p))
#the minimum error observed among the external loop, i.e. ncomp
min_error= which.min(as.numeric(pars[rownames(pars)== "error"]))
#the best model (alpha, etc) where the minimum error is observed
best= as.numeric(pars[rownames(pars)== "best"])[min_error]
fit= Ncomp_ridge[[min_error]]$modlist[[best]]
#now hyperparameters of the best model,
#i.e. where the minimum error was observed
lambda= as.numeric(pars[rownames(pars)== optionsFCnet("whichLambda")])[min_error]
alpha= as.numeric(pars[rownames(pars)== "alpha"])[min_error]
N_comp= as.numeric(pars[rownames(pars)== "N_comp"])[min_error]
which_comp= test_c[[min_error]]
#return coefficients - only for components that were actually tested
#the rest is set to zero
cf= coef(fit, s= lambda)[,1]
zeros[c(1, which_comp+1)] = cf
cf= zeros
coeffs= data.frame(Feature= cname,
Coefficient= cf)
rownames(coeffs)= NULL
#prevalidated CV predictions
predictions= fit$fit.preval[, which(fit$lambda== lambda)]
#for binomial, negative values are class 0, positive class 1
#predict(fit, newx = x, s= lambda, type= "class")
#return best parameters
bp= list(alpha= alpha,
lambda= lambda,
N_comp= which_comp,
fit= fit,
predictions= predictions,
y= y,
coeffs= coeffs
)
return(bp)
} #end if | vectors>1
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.