### Estimate function that gives model weights based on observed inputs,
### using splines
#' Function to compute (log) weights of component models from the "predictions"
#' output by xgboost
#'
#' @param preds predictions from xgboost in matrix form, with num_models columns
#' and num_obs rows
#' @param log boolean; return log of component model weights or just the weights
#'
#' @return a matrix with num_models columns and num_obs rows, with (log) weight
#' for model m at observation i in entry [i, m]
compute_model_weights_from_f_hats <- function(
f_hats,
M = M,
J = J,
log = FALSE) {
if(J > 1) {
preds <- matrix(nrow = nrow(f_hats), ncol = M)
for(m in seq_len(M)) {
preds[, m] <- apply(f_hats[, seq_len(J) + (m - 1) * J], 1, sum)
}
} else {
preds <- f_hats
}
for(i in seq_len(nrow(preds))) {
preds[i, ] <- preds[i, ] - logspace_sum(preds[i, ])
}
if(log) {
return(preds)
} else {
return(exp(preds))
}
}
#' Compute (log) weights of component models based on a densitystack fit and
#' new data.
#'
#' @param densitystack_fit a fit xgbstack object
#' @param newdata new x data
#' @param ntreelimit how many boosting iterations worth of trees to use
#' @param log boolean: return log of weights or original weights?
#'
#' @return (log) weights in the format determined by format
#'
#' @export
compute_model_weights <- function(
density_stack_fit,
newdata,
append_newdata = FALSE,
log = FALSE) {
if(!identical(class(density_stack_fit), "density_stack")) {
stop("density_stack_fit must be an object of type density_stack!")
}
if("best_fits" %in% names(density_stack_fit)) {
## cross-validation was used to select regularization parameters
## combine weights from all models and means
model_weights <- rbind.fill(lapply(seq_along(density_stack_fit$best_fits),
function(component_fit_ind) {
compute_model_weights(
density_stack_fit = density_stack_fit$best_fits[[component_fit_ind]],
newdata = newdata,
append_newdata = TRUE,
log = log
) %>%
as.data.frame() %>%
mutate(param_set = component_fit_ind)
}))
model_weights_summarized <- model_weights %>%
group_by_(names(newdata)[names(newdata) %in% names(model_weights)]) %>%
summarize_each(
funs = rep("mean", ncol(model_weights))
)
colname_inds_to_replace <- which(substr(colnames(model_weights_summarized),
nchar(colnames(model_weights_summarized)) - 4,
nchar(colnames(model_weights_summarized))) == "_mean")
colnames(model_weights_summarized)[colname_inds_to_replace] <-
substr(colnames(model_weights_summarized)[colname_inds_to_replace],
1,
nchar(colnames(model_weights_summarized)[colname_inds_to_replace]) - 5)
model_weights_summarized$param_set <- "combined"
model_weights <- rbind(model_weights,
model_weights_summarized
)
if(!append_newdata) {
model_weights <- model_weights[,
!(colnames(model_weights) %in% colnames(newdata)),
drop = FALSE]
}
} else {
## convert newdata to matrix format
newdata_df <- Formula::model.part(density_stack_fit$formula,
data = newdata,
rhs = 1)
newdata_matrix <- newdata_df %>%
as.matrix() %>%
`storage.mode<-`("double")
## get fitted values
M <- ncol(density_stack_fit$model_scores)
J <- ncol(density_stack_fit$dtrain)
f_hat <- matrix(NA,
nrow = nrow(newdata_matrix),
ncol = M * J)
m_j_pairs <- expand.grid(
m = seq_len(M),
j = seq_len(J),
stringsAsFactors = FALSE
)
for(update_ind in seq_len(nrow(m_j_pairs))) {
m <- m_j_pairs$m[update_ind]
j <- m_j_pairs$j[update_ind]
if("lm" %in% class(density_stack_fit$smoother_fits[[update_ind]])) {
f_hat[, j + (m - 1) * J] <- predict(
density_stack_fit$smoother_fits[[update_ind]],
newdata = data.frame(
x = newdata_matrix[, j]))
} else if("smooth.spline" %in% class(density_stack_fit$smoother_fits[[update_ind]])) {
f_hat[, j + (m - 1) * J] <- predict(
density_stack_fit$smoother_fits[[update_ind]],
x = newdata_matrix[, j])$y
}
}
## convert to weights
model_weights <- compute_model_weights_from_f_hats(f_hat,
M = M,
J = J,
log = log)
## set column names
colnames(model_weights) <-
strsplit(as.character(density_stack_fit$formula)[2], " + ", fixed = TRUE)[[1]]
## append newdata
if(append_newdata) {
model_weights <- cbind(
newdata_df,
model_weights
)
}
}
## return
return(model_weights)
}
log_lik_score <- function(log_model_weights, log_model_scores) {
## adding log_weights and component_model_log_scores gets
## log(pi_mi) + log(f_m(y_i | x_i)) = log(pi_mi * f_m(y_i | x_i))
## in cell [i, m] of the result. logspace sum the rows to get a vector with
## log(sum_m pi_mi * f_m(y_i | x_i)) in position i.
## sum that vector to get the objective.
return(sum(logspace_sum_matrix_rows(log_model_weights + log_model_scores)))
}
#' A factory-esque arrangement (not sure if there is an actual name for this
#' pattern) to manufacture an function to calculate first and second order
#' derivatives of the log-score objective, with needed quantities
#' accessible in its parent environment. We do this becaues there's no way to
#' use the info attribute of the dtrain object to store the component model
#' log scores (as would be standard in the xgboost package). But we need to
#' ensure that the objective function has access to these log scores when
#' called in parallel.
#'
#' @param component_model_log_scores N by M matrix where entry [i, m] has the
#' log-score for observation i obtained from component model m
#'
#' @return a function that takes two arguments (preds and dtrain) and computes
#' the first and second order derivatives of the log-score objective function
#' for the stacked model. i.e., it converts preds to model weights for each
#' component model m at each observation i and then combines the component
#' model log scores with those weights to obtain stacked model log scores.
#' See the package vignette for derivations of these calculations. This
#' function is suitable for use as the "obj" function in a call to
#' xgboost::xgb.train
compute_working_weights_and_response <- function(
f_hat_old,
component_model_log_scores,
dtrain,
m,
M,
j,
J,
min_obs_weight = 1) {
## Compute log of component model weights at each observation
log_weights <- compute_model_weights_from_f_hats(f_hat_old,
M = M,
J = J,
log = TRUE)
## adding log_weights and component_model_log_scores gets
## log(pi_mi) + log(f_m(y_i | x_i)) = log(pi_mi * f_m(y_i | x_i))
## in cell [i, m] of the result. logspace sum the rows to get a vector with
## log(sum_m pi_mi * f_m(y_i | x_i)) in position i.
log_weighted_scores <- log_weights + component_model_log_scores
log_weighted_score_sums <- logspace_sum_matrix_rows(log_weighted_scores)
## calculate log of pi_{m} f_{m}(y_i | x_i) / f(y_i | x_i)
log_prop_dens_from_m <- log_weighted_scores[, m] - log_weighted_score_sums
## calculate log of vector g from vignette, where
## g[i] = pi_{m} f_{m}(y_i | x_i) / f(y_i | x_i) - pi_{m}
## g[i] may be either positive or negative, so keep track of sign and log(|g|)
temp <- cbind(log_prop_dens_from_m, log_weights[, m])
sign_g <- sign(apply(temp, 1, function(temp_row) {temp_row[1] - temp_row[2]}))
log_g <- vector(mode = "numeric", length = nrow(temp))
pos_inds <- (sign_g > 0)
if(sum(pos_inds) > 0) {
log_g[pos_inds] <- logspace_sub_matrix_rows(temp[pos_inds, , drop = FALSE])
}
if(sum(!pos_inds) > 0) {
log_g[!pos_inds] <-
logspace_sub_matrix_rows(temp[!pos_inds, c(2, 1), drop = FALSE])
}
## calculate log of observation weights from vignette, where
## w[i] = [ pi_{m} f_{m}(y_i | x_i) / f(y_i | x_i) ]^2 + pi_{m}
## - [ pi_{m} f_{m}(y_i | x_i) / f(y_i | x_i) + pi_{m}^2 ]
## w[i] may be either positive or negative, so keep track of sign and log(|w|)
term_1 <- logspace_sum_matrix_rows(
cbind(
2 * log_prop_dens_from_m,
log_weights[, m]
)
)
term_2 <- logspace_sum_matrix_rows(
cbind(
log_prop_dens_from_m,
2 * log_weights[, m]
)
)
temp <- cbind(term_1, term_2)
sign_w <- sign(apply(temp, 1, function(temp_row) {temp_row[1] - temp_row[2]}))
log_w <- vector(mode = "numeric", length = nrow(temp))
pos_inds <- (sign_w > 0)
if(sum(pos_inds) > 0) {
log_w[pos_inds] <- logspace_sub_matrix_rows(temp[pos_inds, , drop = FALSE])
}
if(sum(!pos_inds) > 0) {
log_w[!pos_inds] <- logspace_sub_matrix_rows(temp[!pos_inds, c(2, 1), drop = FALSE])
}
## truncate below -- at this point, all sign_w are +1, so we ignore that
log_w[(pos_inds & (log_w < log(min_obs_weight))) || !pos_inds] <-
log(min_obs_weight)
## calculate working response, see vignette
working_response <- sign_g * exp(log_g - log_w) + f_hat_old[, j + (m - 1) * J]
## return
return(list(w = exp(log_w), y = working_response))
}
#' Fit a stacking model given a measure of performance for each component model
#' on a set of training data, and a set of covariates to use in forming
#' component model weights
#'
#' @param formula a formula describing the model fit. left hand side should
#' give columns in data with scores of models, separated by +. right hand
#' side should specify explanatory variables on which weights will depend.
#' @param data a data frame with variables in formula
#' @param booster what form of boosting to use? see xgboost documentation
#' @param subsample fraction of data to use in bagging. not supported yet.
#' @param colsample_bytree fraction of explanatory variables to randomly select
#' in growing each regression tree. see xgboost documentation
#' @param colsample_bylevel fraction of explanatory variables to randomly select
#' in growing each level of the regression tree. see xgboost documentation
#' @param max_depth maximum depth of regression trees. see xgboost documentation
#' @param min_child_weight not recommended for use. see xgboost documentation
#' @param eta learning rate. see xgboost documentation
#' @param gamma Penalty on number of regression tree leafs. see xgboost documentation
#' @param lambda L2 regularization of contribution to model weights in each
#' round. see xgboost documentation
#' @param alpha L1 regularization of contribution to model weights in each round.
#' see xgboost documentation
#' @param nrounds see xgboost documentation
#' @param cv_params optional named list of parameter values to evaluate loss
#' via cross-validation. Each component is a vector of parameter values with
#' name one of "booster", "subsample", "colsample_bytree",
#' "colsample_bylevel", "max_depth", "min_child_weight", "eta", "gamma",
#' "lambda", "alpha", "nrounds"
#' @param cv_folds list specifying observation groups to use in cross-validation
#' each list component is a numeric vector of observation indices.
#' @param cv_nfolds integer specifying the number of cross-validation folds to
#' use. if cv_folds was provided, cv_nfolds is ignored. if cv_folds was not
#' provided, the data will be randomly partitioned into cv_nfolds groups
#' @param cv_refit character describing which of the models specified by the
#' values in cv_params to refit using the full data set. Either "best",
#' "ttest", or "none".
#' @param update an object of class xgbstack to update
#' @param nthread number of threads to use
#' @param verbose how much output to generate along the way. 0 for no logging,
#' 1 for some logging
#'
#' @return an estimated xgbstack object, which contains a gradient tree boosted
#' fit mapping observed variables to component model weights
#'
#' @export
density_stack_fixed_regularization <- function(formula,
data,
smoothers = "smooth.spline",
df = NULL,
spar = NULL,
lambda = NULL,
min_obs_weight = 1,
tol = 10^{-5},
maxit = 10^5,
verbose = 0) {
formula <- Formula::Formula(formula)
## response, as a matrix of type double
model_scores <- Formula::model.part(formula, data = data, lhs = 1) %>%
as.matrix() %>%
`storage.mode<-`("double")
## predictors, as a matrix of type double
dtrain <- Formula::model.part(formula, data = data, rhs = 1) %>%
as.matrix() %>%
`storage.mode<-`("double")
## number of models, observations, covariates
M <- ncol(model_scores)
N <- nrow(dtrain)
J <- ncol(dtrain)
## list of arguments to smooth.spline(), suitable for use with do.call()
## dtrain and temp are objects that will have been intantiated at the time
## smooth.spline() is called below.
smooth.spline_args <- list(
x = quote(dtrain[, j]),
y = quote(temp$y),
w = quote(temp$w),
cv = NA,
keep.data = FALSE
)
## list of arguments to lm(), suitable for use with do.call()
## dtrain and temp are objects that will have been intantiated at the time
## smooth.spline() is called below.
lm_args <- list(
formula = y ~ 0 + x,
weights = quote(temp$w),
data = quote(
data.frame(
x = dtrain[, j],
y = temp$y))
)
## clean up regularization parameter and add to smooth.spline_args
if(!missing(df) && !is.null(df)) {
df <- as.numeric(df)
if(identical(length(df), 1L)) {
df <- rep(df, J)
smooth.spline_args <- c(smooth.spline_args,
df = quote(df[j]))
smooth.spline_args$cv <- FALSE
} else if(!identical(length(df), J)) {
stop("If provided, df must be a numeric vector of length 1 or J where J is the number of predictive covariates.")
}
} else if(!missing(lambda) && !is.null(lambda)) {
lambda <- as.numeric(lambda)
if(identical(length(lambda), 1L)) {
lambda <- rep(lambda, J)
smooth.spline_args <- c(smooth.spline_args,
lambda = quote(lambda[j]))
} else if(!identical(length(lambda), J)) {
stop("If provided, lambda must be a numeric vector of length 1 or J where J is the number of predictive covariates.")
}
} else {
if(missing(spar) || is.null(spar)) {
warning("No regularization parameters included: setting spar = 0.5!")
spar <- 0.5
} else {
spar <- as.numeric(spar)
}
if(identical(length(spar), 1L)) {
spar <- rep(spar, J)
smooth.spline_args <- c(smooth.spline_args,
spar = quote(spar[j]))
} else if(!identical(length(spar), J)) {
stop("If provided, spar must be a numeric vector of length 1 or J where J is the number of predictive covariates.")
}
}
if(identical(length(smoothers), 1L)) {
smoothers <- rep(smoothers, J)
} else if(!identical(length(smoothers), as.integer(J))) {
stop("Argument 'smoothers' must be a character vector of length 1 or J where J is the number of predictive covariates.")
}
## get fit
f_hat_new <- matrix(0, nrow = N, ncol = M * J)
f_hat_ssd <- tol + 1
m_j_pairs <- expand.grid(
m = seq_len(M),
j = seq_len(J),
stringsAsFactors = FALSE
)
it_ind <- 1L
while(f_hat_ssd > tol && it_ind <= maxit) {
## store spline fit values from previous iteration
f_hat_old <- f_hat_new
for(update_ind in seq_len(nrow(m_j_pairs))) {
m <- m_j_pairs$m[update_ind]
j <- m_j_pairs$j[update_ind]
## Fit smoother for model m, covariates j
temp <- compute_working_weights_and_response(
f_hat_old = f_hat_old,
component_model_log_scores = model_scores,
dtrain = dtrain,
m = m,
M = M,
j = j,
J = J,
min_obs_weight = min_obs_weight)
if(identical(unname(smoothers[j]), "smooth.spline")) {
smoother_fit_m_j <- do.call("smooth.spline", smooth.spline_args)
## fitted values used to update f_hat_new
f_hat_new[, j + (m - 1) * J] <-
predict(smoother_fit_m_j, x = dtrain[, j])$y
} else if(identical(unname(smoothers[j]), "lm")) {
smoother_fit_m_j <- do.call("lm", lm_args)
## fitted values used to update f_hat_new
f_hat_new[, j + (m - 1) * J] <- fitted(smoother_fit_m_j)
} else {
stop("Invalid smoother type: currently, only 'smooth.spline' and 'lm' are supported.")
}
}
## sum of squared differences in fitted spline values between iterations
## comparison to tol is stopping criterion
f_hat_ssd <- sum((f_hat_new - f_hat_old)^2)
if(verbose >= 1) {
message(paste0("Iteration ", it_ind, "; f_hat_ssd = ", f_hat_ssd, "\n"))
}
## update iteration index
## comparison to maxit is stopping criterion
it_ind <- it_ind + 1L
}
if(it_ind == maxit) {
warning("Reached maximum number of iterations without convergence.")
}
## one last update to get final spline fits to save/return
smoother_fits <- lapply(seq_len(nrow(m_j_pairs)),
function(update_ind) {
m <- m_j_pairs$m[update_ind]
j <- m_j_pairs$j[update_ind]
## Fit smoother for model m, covariates j
temp <- compute_working_weights_and_response(
f_hat_old = f_hat_old,
component_model_log_scores = model_scores,
dtrain = dtrain,
m = m,
M = M,
j = j,
J = J,
min_obs_weight = min_obs_weight)
if(identical(unname(smoothers[j]), "smooth.spline")) {
smoother_fit_m_j <- do.call("smooth.spline", smooth.spline_args)
} else if(identical(unname(smoothers[j]), "lm")) {
smoother_fit_m_j <- do.call("lm", lm_args)
}
return(smoother_fit_m_j)
})
## return
return(structure(
list(smoother_fits = smoother_fits,
formula = formula,
model_scores = model_scores,
dtrain = dtrain,
df = df,
spar = spar,
lambda = lambda),
class = "density_stack"
))
}
#' Fit a stacking model given a measure of performance for each component model
#' on a set of training data, and a set of covariates to use in forming
#' component model weights. Optionally, use cross-validation to select
#' amount of regularization.
#'
#' @export
density_stack <- function(formula,
data,
smoothers = "smooth.spline",
df = NULL,
spar = NULL,
lambda = NULL,
min_obs_weight = 0,
tol = 10^{-5},
maxit = 10^5,
cv_folds = NULL,
cv_nfolds = 10L,
cv_refit = "ttest",
verbose = 0) {
## keep only one set of regularization parameters
## prefer spar, then lamba, then df
if(!missing(spar) && !is.null(spar)) {
if(!missing(df) || !missing(lambda)) {
warning("Multiple sets of regularization parameters provided -- using only spar")
df <- NULL
lambda <- NULL
}
} else if(!missing(lambda) && !is.null(lambda)) {
if(!missing(df)) {
warning("Multiple sets of regularization parameters provided -- using only lambda")
df <- NULL
}
} else if(missing(df)) {
warning("No regularization parameter provided -- setting spar = 0.5!")
spar <- 0.5
}
## list of arguments suitable for use in a call to
## density_stack_fixed_regularization() via do.call()
density_stack_fixed_reg_params <- list(
formula = formula,
min_obs_weight = min_obs_weight,
tol = tol,
maxit = maxit,
verbose = verbose - 1L)
## get data, names of covariates
formula_as_Formula <- Formula::Formula(formula)
## response, as a matrix of type double
model_scores <- Formula::model.part(formula_as_Formula, data = data, lhs = 1) %>%
as.matrix() %>%
`storage.mode<-`("double")
## predictors, as a matrix of type double -- but only the first row
dtrain <- Formula::model.part(formula_as_Formula, data = data[1, , drop = FALSE], rhs = 1) %>%
as.matrix() %>%
`storage.mode<-`("double")
## number of models, observations, covariates
M <- ncol(model_scores)
N <- nrow(model_scores)
J <- ncol(dtrain)
covar_names <- colnames(dtrain)
## clean up smoothers argument
## ensure that either
## (a) a smoother was provided for all covariates, or
## (b) only one smoother was provided, to be used for all covariates
if(!is.null(smoothers)) {
if(length(smoothers) > 1) {
if(!all(covar_names %in% names(smoothers))) {
stop("Smoothers must be either a character vector of length one or a named character vector whose names include all covariates in right-hand side of formula: ", paste(covar_names, sep = ", "))
}
## pull out only relevant parameter values, in correct order
smoothers <- smoothers[covar_names]
}
## add argument to density_stack_fixed_reg_params
density_stack_fixed_reg_params$smoothers <- smoothers
}
if(ifelse(is.data.frame(df), nrow(df) > 1, FALSE) ||
ifelse(is.data.frame(spar), nrow(spar) > 1, FALSE) ||
ifelse(is.data.frame(lambda), nrow(lambda) > 1, FALSE)) {
## cross-validation for regularization parameter selection
## if they weren't provided, get sets of observations for cv folds
## otherwise, set cv_nfolds = number of folds provided
if(is.null(cv_folds)) {
cv_fold_memberships <- cut(seq_len(nrow(data)), cv_nfolds) %>%
as.numeric() %>%
sample(size = nrow(data), replace = FALSE)
cv_folds <- lapply(seq_len(cv_nfolds),
function(fold_ind) {
which(cv_fold_memberships == fold_ind)
}
)
} else {
cv_nfolds <- length(cv_folds)
}
## ensure that either
## (a) regularization parameter values were provided for all covariates, or
## (b) only one set of regularization parameter values was provided,
## to be used for all covariates
if(!is.null(df)) {
if(ncol(df) > 1) {
if(!all(covar_names %in% colnames(df))) {
stop("Cross-validation parameter grid must have only one column or include values for all covariates in right-hand side of formula: ", paste(covar_names, sep = ", "))
}
## pull out only relevant parameter values, in correct order
df <- df[, covar_names]
}
## add argument to density_stack_fixed_reg_params
density_stack_fixed_reg_params$df <- quote(df[cv_ind, ])
} else if(!is.null(spar)) {
if(ncol(spar) > 1) {
if(!all(covar_names %in% colnames(spar))) {
stop("Cross-validation parameter grid must have only one column or include values for all covariates in right-hand side of formula: ", paste(covar_names, sep = ", "))
}
## pull out only relevant parameter values, in correct order
spar <- spar[, covar_names]
}
## add argument to density_stack_fixed_reg_params
density_stack_fixed_reg_params$spar <- quote(spar[cv_ind, ])
} else if(!is.null(lambda)) {
if(ncol(lambda) > 1) {
if(!all(covar_names %in% colnames(lambda))) {
stop("Cross-validation parameter grid must have only one column or include values for all covariates in right-hand side of formula: ", paste(covar_names, sep = ", "))
}
## pull out only relevant parameter values, in correct order
lambda <- lambda[, covar_names]
}
## add argument to density_stack_fixed_reg_params
density_stack_fixed_reg_params$lambda <- quote(lambda[cv_ind, ])
}
## construct data frame with all combinations of parameter value settings
cv_results <- expand.grid(c(df, spar, lambda), stringsAsFactors = FALSE)
num_reg_param_cols <- ncol(cv_results)
# ## if update was provided, subset to only rows specifying new parameter
# ## combinations (don't re-estimate cv results already in update)
# if(update_same_data_and_cv) {
# cv_results <- cv_results %>%
# anti_join(update$cv_results, by = names(base_params)[names(base_params) != "nthread"])
# }
## space for cv results
cv_results <- cbind(cv_results,
matrix(NA, nrow = nrow(cv_results), ncol = cv_nfolds + 1) %>%
`colnames<-`(
c(paste0("cv_log_score_fold_", seq_len(cv_nfolds)), "cv_log_score_mean")
)
)
density_stack_fixed_reg_params$data <-
quote(data[-cv_folds[[k]], , drop = FALSE])
for(cv_ind in seq_len(nrow(cv_results))) {
if(verbose >= 1) {
message(paste0("Fitting cv model ",
cv_ind,
" of ",
nrow(cv_results)
))
}
## for each k = 1, ..., cv_nfolds,
## a) get fit leaving out fold k
## b) get log score for fold k (possibly for multiple values of nrounds)
cv_results[cv_ind, paste0("cv_log_score_fold_", seq_len(cv_nfolds))] <-
foreach(k = seq_len(cv_nfolds), .combine = c) %dopar% {
# for(k in seq_len(cv_nfolds)) { # REPLACE WITH FOREACH LATER
## step a) get fit leaving out fold k
stacking_fit_k <- do.call("density_stack_fixed_regularization",
density_stack_fixed_reg_params)
## step b) get log score for fold k (val for validation)
log_lik_score(
log_model_weights = compute_model_weights(stacking_fit_k,
newdata = data[cv_folds[[k]], covar_names, drop = FALSE],
append_newdata = FALSE,
log = TRUE),
log_model_scores = model_scores[cv_folds[[k]], , drop = FALSE])
} # end code to get log score for each k-fold (foreach)
} # end code to get log score for each parameter combination
cv_results$cv_log_score_mean <-
apply(cv_results[, paste0("cv_log_score_fold_", seq_len(cv_nfolds))],
1,
mean)
# ## if update was provided, merge cv_results with previous cv_results
# if(update_same_data_and_cv) {
# cv_results <- cv_results %>%
# bind_rows(update$cv_results)
# }
## get fit with all training data based on selected parameters
density_stack_fixed_reg_params$data <- quote(data)
best_params_ind <- which.max(cv_results$cv_log_score_mean)
reg_param_name <- names(density_stack_fixed_reg_params)[
names(density_stack_fixed_reg_params) %in% c("df", "spar", "lambda")
]
if(identical(cv_refit, "best")) {
## fit based on the single set of parameter values with best performance
## best ind has highest log score
density_stack_fixed_reg_params[[reg_param_name]] <-
quote(cv_results[best_params_ind, seq_len(num_reg_param_cols)])
density_stack_fit <- do.call("density_stack_fixed_regularization",
density_stack_fixed_reg_params)
density_stack_fit$cv_results <- cv_results
} else if(identical(cv_refit, "ttest")) {
## fits based on all sets of parameter values with cv performance
## not different from the cv performance of the best parameter set
## according to a paired t test looking at results from all cv folds
## best ind has highest log score
refit_params_inds <- sapply(seq_len(nrow(cv_results)),
function(ind) {
t.test(
x = as.numeric(cv_results[ind,
paste0("cv_log_score_fold_", seq_len(cv_nfolds))]),
y = as.numeric(cv_results[best_params_ind,
paste0("cv_log_score_fold_", seq_len(cv_nfolds))]),
paired = TRUE
)$p.value >= 0.05
})
## NA's may result if CV prediction log scores are the same for the model
## specified by ind as for the model specified by best_params_ind (and so
## NA is guaranteed at at least the index of best_params_ind
refit_params_inds[is.na(refit_params_inds)] <- TRUE
params <- cv_results[refit_params_inds,
seq_len(ncol(cv_results) - cv_nfolds - 1), drop = FALSE]
density_stack_fixed_reg_params[[reg_param_name]] <-
quote(cv_results[k, seq_len(num_reg_param_cols)])
density_stack_fits_best_params <-
foreach(k = seq_len(nrow(params))) %dopar% {
do.call("density_stack_fixed_regularization",
density_stack_fixed_reg_params)
}
density_stack_fit <- structure(
list(best_fits = density_stack_fits_best_params,
params = params,
cv_results = cv_results,
formula = formula,
model_scores = model_scores,
dtrain = dtrain,
df = df,
spar = spar,
lambda = lambda),
class = "density_stack"
)
} else if(identical(cv_refit, "none")) {
fit <- NULL
} else {
warning("Invalid option for cv_refit: must be one of 'best', 'ttest', or 'none'; treating as 'none'")
fit <- NULL
}
} else {
## no cross-validation for parameter selection
## get fit with all training data based on provided parameters
density_stack_fit <- density_stack_fixed_regularization(
formula = formula,
smoothers = smoothers,
data = data,
df = df,
spar = spar,
lambda = lambda,
min_obs_weight = min_obs_weight,
tol = tol,
maxit = maxit,
verbose = verbose)
}
return(density_stack_fit)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.