#' Two-stage log-ratio lasso
#'
#' Fits the two-stage log-ratio lasso.
#'
#' @param z The input matrix. Assumed to be the logarithmic transform of positive raw inputs.
#' @param y The real-valued response.
#' @param k_max The largest number of log-ratios to consider.
#' @param lambda_1 Optional vector of lambda values.
#' @param second.stage Specifies whether to fit the second stage on the response y or the fitted values yhat.
#' Fitting on yhat is known as "conservative two-stage" in the log-ratio lasso paper.
#' @param ... Additional arguments passed to "glmnet.constr".
#' @return beta A list of p by k_max matrices where column j gives the coefficient value after j steps.
#' each list entry corresponds to a value of lambda.
#' @return coef. The theta coefficients selected for each model. Recommended for internal use only.
#' @return lambda The grid of lambda values used for the fitting.
#' @return selected_vars A matrix showing which log-terms are active at each step of the second-stage
#' forward stepwise procedure.
two_stage <- function(z, y, k_max = 10, lambda_1 = NULL, second.stage = "y", family = "gaussian", nlambda = 20, ...) {
# executes two-stage procedure.
# assumes centered response variable.
p <- ncol(z)
constrained_fit <- glmnet.constr(z, y, family = family, lambda = lambda_1, nlambda = nlambda, ...)
lambda_1 <- constrained_fit$lambda
betas <- constrained_fit$beta
#print(betas)
#print(lambda_1)
selected_vars <- apply(betas, 2, function(x){(abs(x) > 0)})
output <- list()
for (i in 1:length(lambda_1)) {
if(sum(selected_vars[, i]) > 0) {
#print(paste0("which: i",i))
#print(selected_vars)
expanded_set <- small_to_big_z(z, which(selected_vars[, i]))
y_hat <- z %*% betas[, i]
resp <- y
if(second.stage == "yhat") {
resp <- y_hat
}
d <- sum(selected_vars[, i]) - 1
k <- min(d, k_max)
suppressWarnings(output[[i]] <- custom_fs(expanded_set, resp, k, selected_vars[, i], p, family = family))
#print(output[[i]])
} else {
output[[i]] <- NA
}
}
betas <- lapply(output, function(x) {out_to_beta(x, k_max, p)})
list(betas = betas, coef = output, lambda = lambda_1, selected_vars = selected_vars)
}
out_to_beta <- function(obj, k, p) {
#takes output of "two_stage$coef[[i]]" and returns p by k matrix of beta coefficients
#internal helper function
betas <- matrix(0, p, k)
if(sum(is.na(obj)) > 0) {
return(betas)
} else if (class(obj) != "list") {
return(betas)
}
#print(paste0("obj", obj))
d <- min(k, nrow(obj$theta_ind))
#print(d)
#print(k)
if(d == 0) {
return(betas)
}
# print(obj$subset)
# print(obj$theta_ind)
# print(paste0("d:", d))
# print(dim(betas))
# print(paste0("p:", p))
for(i in 1:d) {
beta <- rep(0, p)
# print(i)
# print(obj$subset[obj$theta_ind[i,1]])
# print(obj$subset[obj$theta_ind[i,2]])
# print(obj$theta_vals[[i]])
for(j in 1:i){
beta[obj$subset[obj$theta_ind[j,1]]] <- beta[obj$subset[obj$theta_ind[j,1]]] +
obj$theta_vals[[i]][j]
beta[obj$subset[obj$theta_ind[j,2]]] <- beta[obj$subset[obj$theta_ind[j,2]]] -
obj$theta_vals[[i]][j]
}
# print(paste0("i completed"))
betas[, i] <- beta
}
#fill in remaining entries with same beta
#print(d)
if(d < k) {
for(i in (d+1):k) {
betas[, i] <- betas[, d]
}
}
#print(betas)
betas
}
predict_two_stage <- function(z, y, new_z, family = "gaussian", lambda_1 = NULL, k_max = 5, nlambda = 20, ...) {
# forms predictions on new data from two_stage model
# output is list of size length(lambda_1)
# each entry is matrix of length ncol(new_z) and width k_max
# intended for internal use with "cv_two_stage" functions
p <- ncol(z)
fit <- two_stage(z, y, lambda_1 = lambda_1, k_max = k_max, nlambda = nlambda, family = family, ...)
#print(fit$coef)
betas <- lapply(fit$coef, function(obj){out_to_beta(obj, k_max, p)})
predictor <- function(betas) {
if(!is.null(ncol(betas))) {
return(apply(betas,2,function(b){new_z %*% b}))
}
return(matrix(rep(new_z %*% betas, k_max), ncol = k_max, byrow = FALSE))
}
y_pred <- lapply(betas, predictor)
#list of n_pred by k matrix of predictions
out <- list(two_step_obj = fit, y_pred = y_pred)
out
}
#' CV for two-stage
#'
#' Cross-validation for the lambda and k parameters in the two-stage log-ratio lasso.
#'
#' @param z The input matrix. Assumed to be the logarithmic transform of positive raw inputs.
#' @param y The real-valued response.
#' @param k_max The largest number of log-ratios to consider.
#' @param lambda_1 Optional vector of lambda values.
#' @param n_folds Optional number of folds. Default is 10.
#' @param ... Additional arguments passed to "two_stage"
#' @return mse A matrix of MSE values generated by CV.
#' @return best_params The best lambda and k parameter values chosen by CV.
#' @return lambda_min A best value of lambda.
#' @return k_min A best value of k.
#' @return two_step_obj The two_step object fit on the full data.
#' @return beta_min The parameter value selected by CV.
#' @return lambda The grid of lambda values used.
cv_two_stage <- function(z, y, family = "gaussian", lambda_1 = NULL, k_max = 10,
n_folds = 10, nlambda = 20, folds = NULL, ...) {
p <- ncol(z)
#returns list of length lambda of vectors of size k_max with the prediction error.
two_step_obj <- two_stage(z, y, family = family, lambda_1 = lambda_1, k_max = k_max, nlambda = nlambda, ...)
lambda_1 <- two_step_obj$lambda
#Create equally size folds
if(is.null(folds)) {
folds <- sample(cut(seq(1,nrow(z)),breaks=n_folds,labels=FALSE))
} else {
n_folds = max(folds) - min(folds) + 1
}
#Perform k-fold cross validation
mse <- list()
for(i in 1:n_folds){
print(paste0("Starting CV fold ", i))
#Segement your data by fold using the which() function
test_indices <- which(folds==i,arr.ind=TRUE)
out <- predict_two_stage(z[-test_indices, ], y[-test_indices], new_z = z[test_indices, ],
lambda_1 = lambda_1, k_max = k_max, nlambda = nlambda, family = family, ...)
if(family == "gaussian") {
mse_fun <- function(y_pred) {
if(!is.null(ncol(y_pred))) {
if(ncol(y_pred > 1)) {
return(apply(y_pred,2,function(yp){sum((yp - y[test_indices])**2)}))
}
}
sum((yp - y[test_indices])**2)
}
} else if (family == "binomial") {
mse_fun <- function(y_pred) {
if(!is.null(ncol(y_pred))) {
if(ncol(y_pred > 1)) {
return(apply(y_pred,2,function(yp){sum(log(1 + exp(-(2*y[test_indices] - 1) * y_pred)))}))
}
}
sum(log(1 + exp(-(2*y[test_indices] - 1) * y_pred)))
}
}
mse[[i]] <- lapply(out$y_pred, mse_fun)
#glmnet may return fewer than nlambda entries if convergence fails
while(length(mse[[i]]) < nlambda) {
mse[[i]][[length(mse[[i]]) + 1]] = rep(Inf, k)
}
}
mse_full <- lapply(mse[[1]], function(x){x / n_folds})
#print(length(mse_full))
for(i in 2:n_folds) {
for(j in 1:length(mse[[1]])) {
mse_full[[j]] <- mse_full[[j]] + mse[[i]][[j]] / n_folds
}
}
mse_full <- simplify2array(mse_full)
best <- which(mse_full == min(mse_full), arr.ind = TRUE)
lambda_min <- best[1,2]
k_min <- best[1,1]
beta_min <- out_to_beta(two_step_obj$coef[[lambda_min]], k_max, p)[, k_min]
list(mse = mse_full, best_params = best, lambda_min = lambda_min, k_min = k_min,
two_step_obj = two_step_obj, beta_min = beta_min, lambda = lambda_1)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.