Description Usage Arguments Details Value Author(s) References See Also Examples
k-fold cross-validation for FuncompCGL; produces a plot and returns the optimal values of `lam` and `k`.
1 2 3 |
y |
response vector with length n. |
X |
a data frame or matrix.
|
Zc |
a n*p_c design matrix of unpenalized variables. Default is NULL. |
lam |
a user supplied lambda sequence.
If |
nlam |
the length of the |
k |
a vector of integer values of the degrees of freedom; default is 4:10. |
ref |
reference level (baseline), either an integer between [1,p] or
|
foldid |
an optional vector of values between 1 and the sample size |
nfolds |
number of folds, default is 10. The smallest allowable value is |
W |
a vector of length p (the total number of groups),
or a matrix with dimension p1*p1, where
|
trim |
percentage to be trimmed off the prediction errors from either side; default is 0. |
outer_maxiter |
maximum number of loops allowed for the augmented Lagrange method. |
keep |
If |
... |
other arguments that can be passed to |
k-fold cross validation.
An object of S3 class "cv.FuncompCGL"
is returned, which is a list
containing:
FuncompCGL.fit |
a list of length |
lam |
the sequence of |
Ftrim |
a list for cross validation results with trim = 0.
|
Ttrim |
a list of cross validation result with |
fit.preval, foldid |
|
Zhe Sun and Kun Chen
Sun, Z., Xu, W., Cong, X., Li, G. and Chen, K. (2020) Log-contrast regression with functional compositional predictors: linking preterm infants' gut microbiome trajectories to neurobehavioral outcomes, Annals of Applied Statistics, https://arxiv.org/abs/1808.02403
FuncompCGL
and GIC.FuncompCGL
,
and predict
, coef
and plot
methods for "cv.FuncompCGL"
object.
## generate training and testing data
## Dimensions: p compositional components, each expanded on a df_beta-dim
## B-spline basis; only the first four components carry non-zero signal.
df_beta <- 5
p <- 30
beta_C_true <- rbind(
  c(-0.5, -0.5, -0.5, -1.0, -1.0),
  c( 0.8,  0.8,  0.7,  0.6,  0.6),
  c(-0.8, -0.8,  0.4,  1.0,  1.0),
  c( 0.5,  0.5, -0.6, -0.6, -0.6),
  matrix(0, nrow = p - 4, ncol = df_beta)
)
## Train/test sizes and a fixed fold assignment shared by all CV fits below.
n_train <- 50
n_test <- 30
nfolds <- 5
foldid <- sample(rep(seq(nfolds), length = n_train))
## Candidate degrees of freedom for the basis expansion.
k_list <- c(4, 5)
## Simulate training data from the functional compositional log-contrast
## model: n_train observations of p components on n_T = 20 time points,
## with the true coefficient matrix flattened row-wise (component-major).
Data <- Fcomp_Model(n = n_train, p = p, m = 0, intercept = TRUE,
SNR = 4, sigma = 3, rho_X = 0.2, rho_T = 0.5,
df_beta = df_beta, n_T = 20, obs_spar = 1, theta.add = FALSE,
beta_C = as.vector(t(beta_C_true)))
## Draw an independent test set with the same settings: strip the function
## name from the recorded call, override the sample size, and re-invoke.
arg_list <- as.list(Data$call)[-1]
arg_list$n <- n_test
Test <- do.call(Fcomp_Model, arg_list)
## Constrained group lasso: k-fold CV of FuncompCGL under the zero-sum
## compositional constraint, over the candidate df values in k_list.
cv_cgl <- cv.FuncompCGL(
  y = Data$data$y, X = Data$data$Comp, Zc = Data$data$Zc,
  intercept = Data$data$intercept, k = k_list,
  foldid = foldid, keep = TRUE
)
plot(cv_cgl, k = k_list)
cv_cgl$Ftrim[c("lam.min", "lam.1se")]
## Coefficients and selected basis dimension under the lam.min rule.
beta <- coef(cv_cgl, trim = FALSE, s = "lam.min")
k_opt <- cv_cgl$Ftrim$lam.min["df"]
## Solution path for the selected k, against L2- then L1-norms of the groups.
fit_opt <- cv_cgl$FuncompCGL.fit[[as.character(k_opt)]]
plot(fit_opt)
plot(fit_opt, ylab = "L1")
## Number of unpenalized coefficients: Zc covariates (if any) plus intercept.
## NOTE: scalar condition, so plain if/else (not ifelse(), which is for
## vectors and strips attributes).
m1 <- if (is.null(ncol(Data$data$Zc))) 0 else ncol(Data$data$Zc)
m1 <- m1 + Data$data$intercept
## Overlay estimates on the truth. When the selected df matches the true df
## the coefficient positions are directly comparable; otherwise only the
## estimated grid (group boundaries at multiples of k_opt) is drawn.
if (k_opt == df_beta) {
  plot(Data$beta, col = "red", pch = 19,
       ylim = range(c(range(Data$beta), range(beta))))
  abline(v = seq(from = 0, to = p * df_beta, by = df_beta))
  abline(h = 0)
  points(beta)
  if (m1 > 0) points(p * df_beta + 1:m1, tail(Data$beta, m1),
                     col = "blue", pch = 19)
} else {
  plot(beta, ylim = range(c(range(Data$beta), range(beta))))
  abline(v = seq(from = 0, to = p * k_opt, by = k_opt))
  abline(h = 0, col = "red")
  if (m1 > 0) points(p * k_opt + 1:m1, tail(Data$beta, m1),
                     col = "blue", pch = 19)
}
## Reshape the fitted vector into the p x k_opt coefficient matrix
## (coefficients are stored group-by-group, i.e. row-wise).
beta_C <- matrix(beta[1:(p * k_opt)], byrow = TRUE, nrow = p)
## Each column sums to (numerically) zero: the zero-sum constraint holds.
cat("colSums:", colSums(beta_C), "\n")
## Selected components = rows with at least one non-zero coefficient.
Nonzero <- which(apply(beta_C, 1, function(x) max(abs(x)) > 0))
cat("selected groups:", Nonzero, "\n")
oldpar <- par(mfrow = c(2, 1))
## True coefficient curves evaluated on the stored basis grid.
sseq <- Data$basis.info[, 1]
beta_curve_true <- Data$basis.info[, -1] %*% t(beta_C_true)
Nonzero_true <- which(apply(beta_C_true, 1, function(x) max(abs(x)) > 0))
matplot(sseq, beta_curve_true, type = "l", ylim = range(beta_curve_true),
        ylab = "True coefficient curves", xlab = "TIME")
abline(a = 0, b = 0, col = "grey", lwd = 2)
text(0, beta_curve_true[1, Nonzero_true], labels = Nonzero_true)
## Estimated curves rebuilt from the selected B-spline basis (df = k_opt).
beta_curve <- splines::bs(sseq, df = k_opt, intercept = TRUE) %*% t(beta_C)
matplot(sseq, beta_curve, type = "l", ylim = range(beta_curve_true),
        ylab = "Estimated coefficient curves", xlab = "TIME")
abline(a = 0, b = 0, col = "grey", lwd = 2)
text(0, beta_curve[1, Nonzero], labels = Nonzero)
par(oldpar)
## L1- and L2-norms of each component's estimated coefficient vector
## (rowSums replaces the equivalent apply(..., 1, sum) for clarity/speed).
plot(rowSums(abs(beta_C)), ylab = "L1-norm", xlab = "Component index")
plot(sqrt(rowSums(beta_C^2)), ylab = "L2-norm", xlab = "Component index")
## Variable selection by thresholding at the average norm.
## Example 1: L2-norm of each estimated coefficient *curve*, approximated
## by the trapezoidal rule on the (equally spaced) observation grid.
Curve_L2 <- colSums(beta_curve^2)
Curve_L2 <- Curve_L2 - colSums(beta_curve[c(1, nrow(beta_curve)), ]^2) / 2
Curve_L2 <- sqrt(Curve_L2 * (Data$basis.info[2, 1] - Data$basis.info[1, 1]))
plot(Curve_L2, xlab = "Component index", ylab = "L2-norm for coefficient curves")
cutoff <- mean(Curve_L2)
Nonzero_cut <- which(Curve_L2 >= cutoff)
Nonzero_cut
## Example 2: average L2-norm of the coefficient *vectors*.
cutoff <- mean(apply(beta_C, 1, function(x) norm(x, "2")))
Nonzero_cut2 <- which(apply(beta_C, 1, function(x) norm(x, "2") >= cutoff))
## Example 3: average L1-norm of the coefficient vectors.
cutoff <- mean(rowSums(abs(beta_C)))
Nonzero_cut3 <- which(rowSums(abs(beta_C)) >= cutoff)
## In-sample MSE on training data and prediction error (PRE) on the
## independent test set, both at the CV-selected lam.min.
pred_train <- predict(cv_cgl, Data$data$Comp, Data$data$Zc, s = "lam.min")
MSE <- sum((drop(Data$data$y) - pred_train)^2) / n_train
pred_test <- predict(cv_cgl, Test$data$Comp, Test$data$Zc, s = "lam.min")
PRE <- sum((drop(Test$data$y) - pred_test)^2) / n_test
## Bundle everything for comparison with the naive and reference fits below.
cgl_result <- list(
  cv.result = cv_cgl,
  beta = beta,
  Nonzero = c("Original" = Nonzero, "Cut" = Nonzero_cut),
  MSE = MSE,
  PRE = PRE
)
## cv_naive: ignore the zero-sum constraint entirely.
## Setting mu_ratio = 0 fits without the linear (zero-sum) constraint:
## no outer loop for the augmented Lagrangian multiplier is run.
cv_naive <- cv.FuncompCGL(y = Data$data$y, X = Data$data$Comp,
                          Zc = Data$data$Zc, intercept = Data$data$intercept,
                          k = k_list, foldid = foldid, keep = TRUE,
                          mu_ratio = 0)
plot(cv_naive, k = k_list)
beta <- coef(cv_naive, trim = FALSE, s = "lam.min")
k_opt <- cv_naive$Ftrim$lam.min['df']
beta_C <- matrix(beta[1:(p * k_opt)], byrow = TRUE, nrow = p)
## Column sums are NOT zero here: the constraint is not enforced.
cat("colSums:", colSums(beta_C), "\n")
Nonzero <- which(apply(beta_C, 1, function(x) max(abs(x)) > 0))
## Curve L2-norms (trapezoidal rule) thresholded at their average.
beta_curve <- splines::bs(sseq, df = k_opt, intercept = TRUE) %*% t(beta_C)
Curve_L2 <- colSums(beta_curve^2) - colSums(beta_curve[c(1, nrow(beta_curve)), ]^2) / 2
Curve_L2 <- sqrt(Curve_L2 * (Data$basis.info[2, 1] - Data$basis.info[1, 1]))
cutoff <- mean(Curve_L2)
Nonzero_cut <- which(Curve_L2 >= cutoff)
## Training MSE and test prediction error at lam.min.
y_hat <- predict(cv_naive, Data$data$Comp, Data$data$Zc, s = "lam.min")
MSE <- sum((drop(Data$data$y) - y_hat)^2) / n_train
y_hat <- predict(cv_naive, Test$data$Comp, Test$data$Zc, s = "lam.min")
PRE <- sum((drop(Test$data$y) - y_hat)^2) / n_test
naive_result <- list(cv.result = cv_naive, beta = beta,
                     Nonzero = c("Original" = Nonzero, "Cut" = Nonzero_cut),
                     MSE = MSE, PRE = PRE)
## cv_base: randomly select one component as the reference (baseline).
## mu_ratio is set to 0 automatically once ref is set to an integer.
ref <- sample(1:p, 1)
cv_base <- cv.FuncompCGL(y = Data$data$y, X = Data$data$Comp,
                         Zc = Data$data$Zc, intercept = Data$data$intercept,
                         k = k_list, foldid = foldid, keep = TRUE,
                         ref = ref)
plot(cv_base, k = k_list)
beta <- coef(cv_base, trim = FALSE, s = "lam.min")
k_opt <- cv_base$Ftrim$lam.min['df']
beta_C <- matrix(beta[1:(p * k_opt)], byrow = TRUE, nrow = p)
## The reference parameterization still yields zero column sums.
cat("colSums:", colSums(beta_C), "\n")
Nonzero <- which(apply(beta_C, 1, function(x) max(abs(x)) > 0))
## Curve L2-norms (trapezoidal rule) thresholded at their average.
beta_curve <- splines::bs(sseq, df = k_opt, intercept = TRUE) %*% t(beta_C)
Curve_L2 <- colSums(beta_curve^2) - colSums(beta_curve[c(1, nrow(beta_curve)), ]^2) / 2
Curve_L2 <- sqrt(Curve_L2 * (Data$basis.info[2, 1] - Data$basis.info[1, 1]))
cutoff <- mean(Curve_L2)
Nonzero_cut <- which(Curve_L2 >= cutoff)
## Training MSE and test prediction error at lam.min.
y_hat <- predict(cv_base, Data$data$Comp, Data$data$Zc, s = "lam.min")
MSE <- sum((drop(Data$data$y) - y_hat)^2) / n_train
y_hat <- predict(cv_base, Test$data$Comp, Test$data$Zc, s = "lam.min")
PRE <- sum((drop(Test$data$y) - y_hat)^2) / n_test
base_result <- list(cv.result = cv_base, beta = beta,
                    Nonzero = c("Original" = Nonzero, "Cut" = Nonzero_cut),
                    MSE = MSE, PRE = PRE)
|
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.