#' Cross Validation for Generalized Projection to Latent Structures Regression
#'
#' @param formula a model formula
#' @param data a training data set
#' @param family the glm family. One of "gaussian", "student", "poisson", "negbinom", "binomial", "multinom", "gamma", "invgauss"
#' @param link the link function. See details for more information.
#' @param cv.method preferably one of "boot632" (the default), "cv", or "repeatedcv".
#' @param nfolds the number of bootstrap or cross-validation folds to use. defaults to 5.
#' @param nrep the number of repetitions for cv.method = "repeatedcv". defaults to 4.
#' @param crit the criterion by which to evaluate the model performance. See details for more information.
#' @param select the selection rule to use. Should be one of "best" or "oneSE" (the default).
#' @param folds a vector of pre-set cross-validation or bootstrap folds from caret::createResample or
#' caret::createFolds.
#'
#' @details The available summary statistics for the argument "crit" depend on which likelihood function is chosen for the glm family.
#' \cr \cr
#' If the outcome is multinomial, it should be one of "Kappa" (the default), "Accuracy", "Mean_F1",
#' "Mean_Sensitivity", "Mean_Specificity", "Mean_Pos_Pred_Value", "Mean_Neg_Pred_Value", "Mean_Precision", "Mean_Recall",
#' or "Mean_Detection_Rate". \cr
#' \cr
#' If the outcome is binomial, it should be one of "Spec" (Specificity, the default) or "Sens". \cr
#' \cr
#' Otherwise, it should be one of "MAE" (the default) or "MSE". \cr \cr
#' \cr
#' #' The following link functions are available for each distribution: \cr
#' \cr
#' Gaussian & Student's T: "identity" \cr
#' Binomial & Multinomial: "logit", "probit", "cauchit", "robit" (Student T with 3 df), and "cloglog" \cr
#' Poisson & Negative Binomial: "log" \cr
#' Gamma: "inverse" (1 / x) \cr
#' Inverse Gaussian: "invsquare" (1/x^2)
#'
#' @return
#' a train object
#' @export
#'
cv_gpls = function(formula, data, family = NULL, link = NULL, cv.method = "boot632", nfolds = 5, nrep = 4, crit = NULL, select = "oneSE", folds = NULL, preproc = c("center" , "scale")){
if (is.null(family)){
cat(crayon::bold(crayon::make_style(rgb(.945, .4057, .016))("warning!")), crayon::blue("Please specify a glm family from the following options:\n"), crayon::make_style(rgb(0.008, 0.341, 0.224))("'gaussian', 'gamma', 'invgauss', 'poisson', 'negbinom', 'binomial', 'multinom'"))
} else {
if (is.null(crit)){
if (family == "multinom"){
crit = "Kappa"
}
if (family == "binomial"){
crit = "Spec"
}
else {
crit = "MAE"
}
}
}
if (!is.null(link)){
if (family == "gaussian"){
if (link != "identity")
cat("link function not recognized for ", family, "only identity is available!\n")
}
else if (family=="binomial" || family == "multinom") {
tf = !is.na(match(link, c("logit", "probit", "cloglog", "cauchit", "robit")))
if (isFALSE(tf))
cat("link function not recognized for ", family, " only logit, probit, cauchit, robit, and cloglog are available!\n")
}
else if (family=="poisson") {
if (link != "log")
cat("link function not recognized for ", family, " only log is available!\n")
}
else if (family=="gamma") {
if (link != "inverse")
cat("link function not recognized for ", family, " only inverse is available!\n")
}
else if (family=="invgauss") {
if (link != "invsquare")
cat("link function not recognized for ", family, " only invsquare is available!\n")
}
else if (family=="negbinom") {
if (link != "log")
cat("link function not recognized for ", family, " only log is available!\n")
}
}
if (family == "binomial" || family == "multinom"){
gplsCV <- list(type = "Classification",
library = "stats",
loop = NULL)
} else {
gplsCV <- list(type = "Regression",
library = "stats",
loop = NULL)
}
gplsCV$family <- family
gplsCV$link <- link
gplsCV$x.names <- colnames(model.frame(formula, data))[-1]
gplsCV$y.name <- colnames(model.frame(formula, data))[1]
gplsCV$formula
prm <- data.frame(parameter = c("ncomp"),
class = rep("numeric", 1),
label = c("ncomp"))
gplsGrid <- function(x, y, len = NULL, search = "grid", family = gplsCV$family) {
## use grid search:
if(search == "grid"){
search = "grid"
} else {
search = "grid"
}
if (family == "multinom"){
k <- length(unique(y)) - 1
ncomp = seq(1 * k, min(nrow(as.matrix(y)), ncol(x)) * k, length.out = min(nrow(as.matrix(y)), ncol(x)))
} else {
ncomp = seq(1, min(nrow(as.matrix(y)), ncol(x)), length.out = min(nrow(as.matrix(y)), ncol(x)))
}
grid <- expand.grid(ncomp = ncomp)
return(grid)
}
gplsFit <- function(x, y, param, Family = gplsCV$family, Link = gplsCV$link, ...) {
dat <- as.data.frame(x)
dat$.outcome <- y
model.out <- gpls(.outcome ~ ., data = dat, ncomp = param$ncomp, family = Family, link = Link, shrinkage = T)
model.out
}
predictgplsCV <- function(object, newdata = NULL, type = c("response", "link", "class")){
if (!inherits(object, "gpls"))
stop("object not of class gpls")
type <- match.arg(type)
family <- object$family
link <- object$link
formula <- object$formula
coefs <- coef(object)
if (is.null(newdata)){
x <- object$model.matrix
} else{
if (!is.data.frame(newdata)){
newdata <- as.data.frame(newdata)
}
x <- cbind.data.frame(rep(1, nrow(newdata)), newdata)
}
eta <- as.vector(coefs %*% t(x))
preds <- linkinv(eta, family = family, link = link)
if (type == "class"){
if (family != "binomial"){
stop("type 'predict' is only valid for predicting class labels for binomial models.")
}
else {
levs <- object$levs
classification <- preds > 0.50
labels <- ifelse(classification, levs[1], levs[2])
return(labels)
#return(as.numeric(classification))
}
}
else if (type == "link"){
return(eta)
}
else if (type == "response"){
return(preds)
}
}
predictgplsCVsoftmax <- function(model, newdata = NULL, type = c("response", "class")) {
type <- match.arg(type)
if (is.null(newdata)){
X <- model$X
} else{
X <- as.matrix(newdata)
}
beta <- model$coefficients
if(all(X[,1] == rep(1,nrow(X))))
{
eta <- X%*%beta
} else
{
eta <- cbind(rep(1,nrow(X)),X)%*% beta
}
p <- exp(eta)
p.denom <- apply(p, 1, function(x) 1+sum(x))
probabilities <- p/p.denom
probabilities <- cbind(1 - rowSums(probabilities), probabilities)
colnames(probabilities) <- model$outcome.levs
if (type == "response")
return(probabilities)
else
return(model$outcome.levs[sapply(1:nrow(probabilities), function(x) which.max(probabilities[x, ]))])
}
gplsPred <- function(modelFit, newdata, submodels = NULL){
if (!is.data.frame(newdata))
newdata <- as.data.frame(newdata)
if (modelFit$family == "binomial"){
predictgplsCV(modelFit, newdata, type = "class")
}
else if (modelFit$family == "multinom"){
predictgplsCVsoftmax(modelFit, newdata, type = "class")
}
else {
predictgplsCV(modelFit, newdata, type = "response")
}
}
gplsCV$grid <- gplsGrid
gplsCV$parameters <- prm
gplsCV$fit <- gplsFit
gplsCV$prob <- gplsPred
gplsCV$predict <- gplsPred
fitResamp = function(pred, obs) {
huber.mean = function(y) {
MASS::hubers(y, initmu = mean(c(mean(y), median(y))), s = sd(y))$mu
}
robmse <- huber.mean((pred - obs)^2)
robmae <- huber.mean(abs(pred - obs))
fit.measure <- c(robmse, robmae)
names(fit.measure) <- c("MSE", "MAE")
if (any(is.nan(fit.measure)))
fit.measure[is.nan(fit.measure)] <- NA
return(fit.measure)
}
fitSummary = function (data, lev = NULL, model = NULL){
if (is.character(data$obs))
data$obs <- factor(data$obs, levels = lev)
fitResamp(data[, "pred"], data[, "obs"])
}
if (family == "multinom"){
if (cv.method == "repeatedcv") {
fitControl <- trainControl(method = cv.method,
number = nfolds,
repeats = nrep,
classProbs = TRUE,
summaryFunction = multiClassSummary,
allowParallel = TRUE,
index = folds,
selectionFunction = select,
search = "grid")
} else {
fitControl <- trainControl(method = cv.method,
number = nfolds,
classProbs = TRUE,
summaryFunction = multiClassSummary,
allowParallel = TRUE,
index = folds,
selectionFunction = select,
search = "grid")
}
}
else if (family == "binomial"){
if (cv.method == "repeatedcv") {
fitControl <- trainControl(method = cv.method,
number = nfolds,
repeats = nrep,
classProbs = TRUE,
summaryFunction = twoClassSummary,
allowParallel = TRUE,
index = folds,
selectionFunction = select,
search = "grid")
} else {
fitControl <- trainControl(method = cv.method,
number = nfolds,
classProbs = TRUE,
summaryFunction = twoClassSummary,
allowParallel = TRUE,
index = folds,
selectionFunction = select,
search = "grid")
}
} else{
if (cv.method == "repeatedcv") {
fitControl <- trainControl(method = cv.method,
number = nfolds,
repeats = nrep,
summaryFunction = fitSummary,
allowParallel = TRUE,
index = folds,
selectionFunction = select,
search = "grid")
} else {
fitControl <- trainControl(method = cv.method,
number = nfolds,
summaryFunction = fitSummary,
allowParallel = TRUE,
index = folds,
selectionFunction = select,
search = "grid")
}
}
out <- train(formula,
data,
method = gplsCV,
metric = crit,
preProcess = preproc,
trControl = fitControl)
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.