#' Penalized Regression with Lasso and Weighted Fusion Penalties with Cross-Validation
#'
#' Uses n-fold cross-validation (CV) to fit a penalized regression model with a Lasso
#' penalty and a weighted fusion penalty. The function returns the loss for all pairs of
#' candidate tuning parameters, identifies the pair with the lowest loss, and estimates
#' the regression coefficients at that pair. The CV procedure tunes the parameters
#' required by the penalty terms, selecting the pair of lambda_1 and lambda_2 that
#' minimizes the average validation loss.
#'
#' @param data An object of class "WTsmth.data" as generated by prep()
#' @param lambda1 A numeric vector. Candidate lambda_1 values that control the
#' Lasso penalty.
#' Provided values will be transformed to 2^(lambda1). The default value is c(-8:0).
#' The user can customize the range and step size of the candidate tuning parameters.
#' In most cases, the function will need to be run more than once, adjusting the
#' range and step size of the candidates according to the `Loss` and
#' `selected.lambda` from the previous round of model fitting, in order to
#' locate a reasonable range (see Examples).
#' @param lambda2 A numeric vector. Candidate lambda_2 values that control the
#' weighted fusion penalty.
#' Provided values will be transformed to 2^(lambda2).
#' The default value is c(-8:8).
#' As with `lambda1`, the range and step size of the candidates can be customized,
#' and more than one round of model fitting is usually needed to locate a
#' reasonable range according to the `Loss` and `selected.lambda` from the
#' previous round.
#' @param weight A character. The type of weighting. Must be one of
#' (`eql`, `keql`, `wcs`, `kwcs`, `wif`, `kwif`), indicating
#' equal weight, K x equal weight, cosine similarity, K x cosine similarity,
#' inverse frequency, and K x inverse frequency, respectively, where K is the
#' number of individuals in each CNV-active region.
#' `eql` and `keql` give equal weight to adjacent CNVs.
#' `wcs` and `kwcs` allow similar CNV fragments to have more similar effect sizes.
#' `wif` and `kwif` encourage CNVs with lower frequency to borrow
#' information from nearby, more frequent CNV fragments.
#' Because CNVs usually occur within CNV-active regions separated by
#' large regions with no CNV at all, K records the number of individuals
#' with any CNV activity in a CNV-active region, varying the weight according
#' to the sample size across regions.
#' @param family A character. The family of the outcome. Must be one of
#' "gaussian" (Y is continuous) or "binomial" (Y is binary).
#' @param cv.control A list object. Allows the user to control the cross-validation
#' procedure. Allowed elements are `n.fold`, the number of cross-validation
#' folds, with a default value of 5; depending on the sample size,
#' other values (such as 3 or 10) can be chosen;
#' `n.core`, the number of cores to use in the procedure;
#' check the available computational resources before choosing;
#' and `stratified`: if TRUE and `family` = "binomial", the folds will be
#' stratified within each category of Y (this option is recommended if either
#' category of the outcome is "rare"). See the Examples for a customized control list.
#' @param iter.control A list object. Allows the user to control the iterative
#' update procedure. Allowed elements are `max.iter`, the maximum number
#' of iterations, which guarantees that the function returns within a reasonable
#' time; `tol.beta`, the convergence threshold on the absolute difference
#' between consecutive beta updates; and `tol.loss`, the convergence threshold
#' on the difference between consecutive loss updates.
#'
#' @param verbose A logical object. If `TRUE`, print progression updates.
#'
#' @returns A list containing
#' 1. `Loss`: The average loss on the validation set for all pairs of candidate
#' tuning parameters; the smaller the loss, the better the performance of the
#' corresponding pair.
#' 2. `selected.lambda`: The selected tuning parameter values that minimize the loss.
#' 3. `coef`: The model coefficient estimates at the selected tuning parameters.
#'
#' @include helpful_tests.R nfoldSplit.R utils.R Wtsmth_Fit.R
#' @import dplyr
#' @import doParallel
#' @import foreach
#' @importFrom tidyr spread
#'
#' @export
#'
#' @examples
#' # Note: we use a very small example data set and few candidate lambda1
#' # and lambda2 values here to expedite the examples.
#'
#' # load toy dataset
#' data("CNVCOVY")
#'
#' # prepare data format for regression analysis
#'
#' ## Continuous outcome Y_QT
#' frag_data <- prep(CNV = CNV, Y = Y_QT, Z = Cov, rare.out = 0.05)
#' QT_tune <- cvfit_WTSMTH(frag_data,
#' lambda1 = seq(-4.75, -5.25, -0.25),
#' lambda2 = seq(18, 22, 1),
#' weight = "eql",
#' family = "gaussian")
#'
#' ## Binary outcome Y_BT
#'
#' # We can directly replace frag_data$Y with Y_BT in the correct format,
#' # ensuring that the ordering matches that of the prepared object.
#'
#' rownames(Y_BT) <- Y_BT$ID
#' frag_data$Y <- Y_BT[names(frag_data$Y), "Y"] |> drop()
#' names(frag_data$Y) <- rownames(frag_data$Z)
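#'
#' # Optional sanity check: the swapped-in outcome should be binary and aligned
#' # with the covariate rows.
#' table(frag_data$Y)
#' stopifnot(identical(names(frag_data$Y), rownames(frag_data$Z)))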
#'
#' # Or, we can also repeat the prep() call
#' # frag_data <- prep(CNV = CNV, Y = Y_BT, Z = Cov, rare.out = 0.05)
#'
#' BT_tune <- cvfit_WTSMTH(frag_data,
#' lambda1 = c(-5.25, -5, -4.75),
#' lambda2 = c(5, 6, 7),
#' weight = "eql",
#' family = "binomial")
#'
cvfit_WTSMTH <- function(data, lambda1 = seq(-8, 0, 1), lambda2 = seq(-8, 8, 1),
weight = NULL,
family = c("gaussian", "binomial"),
cv.control = list(n.fold = 5L,
n.core = 1L,
stratified = FALSE),
iter.control = list(max.iter = 8L,
tol.beta = 10^(-3),
tol.loss = 10^(-6)),
verbose = TRUE) {
# take the first value as default
family <- match.arg(family)
stopifnot(
"`data` must be a `WTsmth.data` object" = !missing(data) && inherits(data, "WTsmth.data"),
"`lambda1` must be a numeric vector" = .isNumericVector(lambda1),
"`lambda2` must be a numeric vector" = .isNumericVector(lambda2),
"`weight` must be one of eql, keql, wcs, kwcs, wif, kwif" =
is.null(weight) || {.isCharacterVector(weight, 1L) &&
weight %in% c("eql", "keql", "wcs", "kwcs", "wif", "kwif")},
"`cv.control` must be a list; allowed elements are n.fold, n.core, and stratified" =
.isNamedList(cv.control, c("n.fold", "n.core", "stratified")),
"`iter.control` must be a list; allowed elements are max.iter, tol.beta, and tol.loss" =
.isNamedList(iter.control, c("max.iter", "tol.beta", "tol.loss"))
)
# expand data with the requested weighting scheme if not already expanded
if (is.null(data$XZ)) {
if (is.null(weight)) stop("`weight` must be provided", call. = FALSE)
data <- .expandWTsmth(data, weight = weight)
} else {
if (!is.null(weight)) warning("`weight` input ignored; data already expanded", call. = FALSE)
}
if (family == "binomial") data$Y <- .confirmBinary(data$Y)
if (family == "gaussian") data$Y <- .confirmContinuous(data$Y)
iter.control <- .testIterControl(iter.control)
cv.control <- .testCVControl(cv.control, family)
# evaluate loss for all candidate lambda1 + lambda2 pairs via K-fold cross-validation

# n-fold split (stratified if indicated)
tr <- .nfoldSplit(Y = data$Y, unique(names(data$Y)), cv.control = cv.control)
if (cv.control$stratified && verbose) {
# show the distribution of outcome categories across folds
table(data$Y, tr) |> print()
}
if (cv.control$n.core > 1L) {
cl <- parallel::makeCluster(cv.control$n.core)
doParallel::registerDoParallel(cl)
on.exit(parallel::stopCluster(cl))
} else {
# register a sequential backend so %dopar% does not warn when n.core == 1
foreach::registerDoSEQ()
}
idx <- expand.grid(seq_len(cv.control$n.fold),
seq_along(lambda1),
seq_along(lambda2))
idx_loss <- cbind(idx[, 1L], lambda1[idx[, 2L]], lambda2[idx[, 3L]])
i <- NULL # quieting CRAN warning
if (verbose) {
message(sprintf("%8s %8s %8s", "fold", "lambda1", "lambda2"))
}
loss_list <- foreach::foreach(i = seq_len(nrow(idx)),
.packages = c("CNVreg"),
.combine = "rbind") %dopar% {
if (verbose) {
formatted_string <- sprintf("%8d %8.1f %8.1f",
idx[i, 1L],
lambda1[idx[i, 2L]],
lambda2[idx[i, 3L]])
message(formatted_string)
}
subset <- tr != idx[i, 1L]
#fit model
beta_lmd21 <- fit_WTSMTH(data = data,
lambda1 = lambda1[idx[i, 2L]],
lambda2 = lambda2[idx[i, 3L]],
family = family,
iter.control = iter.control,
subset = subset)
#evaluate loss
# for continuous this will be a vector(n); for binary it will be scalar
loss <- .loss(X = data$XZ[!subset, ],
Y = data$Y[!subset],
beta = beta_lmd21$coef,
family = family)
loss
}
idx_loss <- cbind(idx_loss, loss_list) |> data.frame()
colnames(idx_loss) <- c("fold", "lambda1", "lambda2", "loss")
avg_loss <- NULL # quieting CRAN warning
# transform the output and take average over folds
loss_matrix <- idx_loss %>%
dplyr::group_by(lambda1, lambda2, .drop = FALSE) %>%
dplyr::summarise(
avg_loss = mean(loss),
.groups = 'drop'
) |> data.frame()
# best (lambda1, lambda2) pair: row of the long-format loss table with minimal loss
Mloss <- which.min(loss_matrix$avg_loss)
# selected tuning parameters
b_lmd1 <- loss_matrix$lambda1[Mloss]
b_lmd2 <- loss_matrix$lambda2[Mloss]
# refit on the full data at the selected tuning parameters
beta_y_cv <- fit_WTSMTH(data = data,
lambda1 = b_lmd1,
lambda2 = b_lmd2,
family = family,
iter.control = iter.control)
# reshape the long-format average loss into a Lambda2-by-lambda1 table
loss <- tidyr::spread(loss_matrix, key = lambda1, value = avg_loss)
colnames(loss) <- c("Lambda2", colnames(loss)[-1L])
list("Loss" = loss,
"selected.lambda" = c(b_lmd1, b_lmd2),
"coef" = beta_y_cv)
}