Example_tuning_free_ridge_regression

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
knitr::opts_chunk$set(error = TRUE)
library(ConformalSmallest)
library(ggplot2)

This vignette is divided into three parts:

1. We explain how we use EFCP and VFCP, compared with cross-validation (CV) and other parametric methods, to tune ridge regression (a minimal sketch of the interface follows below).
2. We generate the plots shown in our paper.
3. We compare the tuning parameters chosen by EFCP, VFCP, CV using two splits, and CV using three splits.
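Before diving in, here is a minimal sketch of the interface used throughout. It is not evaluated here and assumes that efcp_ridge() and vfcp_ridge() accept the same arguments as in the chunks below, returning lower/upper prediction bounds together with the selected penalty:

set.seed(1)
Xtr <- matrix(rnorm(50*5), 50, 5)      #training covariates
Ytr <- Xtr %*% rep(1, 5) + rnorm(50)   #training responses
Xte <- matrix(rnorm(10*5), 10, 5)      #prediction points
fit <- efcp_ridge(Xtr, Ytr, Xte, lambda = seq(0, 10, l = 20), alpha = 0.1)
fit$lambda                             #selected ridge penalty
cbind(fit$lo, fit$up)                  #90% prediction intervals for the rows of Xte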

Linear setting

  1. For the purpose of this example, the data-generating chunk below is not run; we save its output in advance and use the saved data to make the plots.
library(glmnet)
library(MASS)
library(mvtnorm)
#source("ridge_funs.R")
name=paste("linear_fm_t3",sep="")


df <- 3  #degrees of freedom
l <- 60    #number of dimensions 
l.lambda <- 100
lambda_seq <- seq(0,200,l=l.lambda)
dim <- round(seq(5,300,l=l))
alpha <- 0.1
n <- 200   #number of training samples
n0 <- 100  #number of prediction points
nrep <- 100 #number of independent trials
rho <- 0.5

cov.efcp_linear_fm_t3 <- len.efcp_linear_fm_t3 <- matrix(0,nrep,l)
cov.vfcp_linear_fm_t3 <- len.vfcp_linear_fm_t3 <- matrix(0,nrep,l)
cov.naive_linear_fm_t3 <- len.naive_linear_fm_t3 <- matrix(0,nrep,l)
cov.param_linear_fm_t3 <- len.param_linear_fm_t3 <- matrix(0,nrep,l)
cov.star_linear_fm_t3 <- len.star_linear_fm_t3 <- matrix(0,nrep,l)
cov.cv10_linear_fm_t3 <- len.cv10_linear_fm_t3 <- matrix(0,nrep,l)
cov.cv5_linear_fm_t3 <- len.cv5_linear_fm_t3 <- matrix(0,nrep,l)
cov.cvloo_linear_fm_t3 <- len.cvloo_linear_fm_t3 <- matrix(0,nrep,l)


out.efcp.up <- out.efcp.lo <- matrix(0,n0,l)
out.vfcp.up <- out.vfcp.lo <- matrix(0,n0,l)
out.naive.up <- out.naive.lo <- matrix(0,n0,l)
out.param.up <- out.param.lo <- matrix(0,n0,l)
out.star.up <- out.star.lo <- matrix(0,n0,l)
out.cv10.up <- out.cv10.lo <- matrix(0,n0,l)
out.cv5.up <- out.cv5.lo <- matrix(0,n0,l)
out.cvloo.up <- out.cvloo.lo <- matrix(0,n0,l)


for(i in 1:nrep){
  if(i%%10 == 0){
    print(i)
  }
  for (r in 1:l){
    d <- dim[r]
    set.seed(i)

    Sigma <- matrix(rho,d,d)
    diag(Sigma) <- rep(1,d)
    X <- rmvt(n,Sigma,df)   #multivariate t distribution
    beta <- rep(1:5,d/5)
    eps <- rt(n,df)*(1+sqrt(X[,1]^2+X[,2]^2))
    Y <- X%*%beta+eps


    X0 <- rmvt(n0,Sigma,df)
    eps0 <- rt(n0,df)*(1+sqrt(X0[,1]^2+X0[,2]^2))
    Y0 <- X0%*%beta+eps0


    out.param <- ginverse.fun(X,Y,X0,alpha=alpha)
    out.param.lo[,r] <- out.param$lo
    out.param.up[,r] <- out.param$up    
    cov.param_linear_fm_t3[i,r] <- mean(out.param.lo[,r] <= Y0 & Y0 <= out.param.up[,r]) 
    len.param_linear_fm_t3[i,r] <- mean(out.param.up[,r]-out.param.lo[,r]) 


    out.efcp <- efcp_ridge(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    out.efcp.up[,r] <- out.efcp$up
    out.efcp.lo[,r] <- out.efcp$lo    
    cov.efcp_linear_fm_t3[i,r] <- mean(out.efcp.lo[,r] <= Y0 & Y0 <= out.efcp.up[,r]) 
    len.efcp_linear_fm_t3[i,r] <- mean(out.efcp.up[,r]-out.efcp.lo[,r]) 


    out.vfcp <- vfcp_ridge(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    out.vfcp.up[,r] <- out.vfcp$up
    out.vfcp.lo[,r] <- out.vfcp$lo    
    cov.vfcp_linear_fm_t3[i,r] <- mean(out.vfcp.lo[,r] <= Y0 & Y0 <= out.vfcp.up[,r]) 
    len.vfcp_linear_fm_t3[i,r] <- mean(out.vfcp.up[,r]-out.vfcp.lo[,r]) 

    out.naive <- naive.fun(X,Y,X0,alpha=alpha)
    out.naive.up[,r] <- out.naive$up
    out.naive.lo[,r] <- out.naive$lo    
    cov.naive_linear_fm_t3[i,r] <- mean(out.naive.lo[,r] <= Y0 & Y0 <= out.naive.up[,r]) 
    len.naive_linear_fm_t3[i,r] <- mean(out.naive.up[,r]-out.naive.lo[,r]) 


    out.star <- star.fun(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    out.star.up[,r] <- out.star$up
    out.star.lo[,r] <- out.star$lo   
    cov.star_linear_fm_t3[i,r] <- mean(out.star.lo[,r] <= Y0 & Y0 <= out.star.up[,r]) 
    len.star_linear_fm_t3[i,r] <- mean(out.star.up[,r] - out.star.lo[,r]) 


    out.cv5 <- cv.fun(X,Y,X0,lambda=lambda_seq,alpha=alpha,nfolds=5)
    out.cv5.up[,r] <- out.cv5$up
    out.cv5.lo[,r] <- out.cv5$lo    
    cov.cv5_linear_fm_t3[i,r] <- mean(out.cv5.lo[,r] <= Y0 & Y0 <= out.cv5.up[,r]) 
    len.cv5_linear_fm_t3[i,r] <- mean(out.cv5.up[,r] - out.cv5.lo[,r])

  }
}

ridge_linear_len100_t3=list(len.param_linear_fm_t3, len.naive_linear_fm_t3, len.vfcp_linear_fm_t3, len.star_linear_fm_t3, len.cv5_linear_fm_t3, len.efcp_linear_fm_t3)
save(ridge_linear_len100_t3, file = "ridge_linear_len100_t3.rda" )
ridge_linear_cov100_t3=list(dim, cov.param_linear_fm_t3, cov.naive_linear_fm_t3, cov.vfcp_linear_fm_t3, cov.star_linear_fm_t3, cov.cv5_linear_fm_t3, cov.efcp_linear_fm_t3)
save(ridge_linear_cov100_t3, file = "ridge_linear_cov100_t3.rda" )

Similarly, we repeat the simulation with 5 degrees of freedom (this chunk is likewise not run).

library(glmnet)
library(MASS)
library(mvtnorm)
name=paste("linear_fm_t5",sep="")


df <- 5  #degrees of freedom
l <- 60    #number of dimensions 
l.lambda <- 100
lambda_seq <- seq(0,200,l=l.lambda)
dim <- round(seq(5,300,l=l))
alpha <- 0.1
n <- 200   #number of training samples
n0 <- 100  #number of prediction points
nrep <- 100 #number of independent trials
rho <- 0.5

cov.efcp_linear_fm_t5 <- len.efcp_linear_fm_t5 <- matrix(0,nrep,l)
cov.vfcp_linear_fm_t5 <- len.vfcp_linear_fm_t5 <- matrix(0,nrep,l)
cov.naive_linear_fm_t5 <- len.naive_linear_fm_t5 <- matrix(0,nrep,l)
cov.param_linear_fm_t5 <- len.param_linear_fm_t5 <- matrix(0,nrep,l)
cov.star_linear_fm_t5 <- len.star_linear_fm_t5 <- matrix(0,nrep,l)
cov.cv10_linear_fm_t5 <- len.cv10_linear_fm_t5 <- matrix(0,nrep,l)
cov.cv5_linear_fm_t5 <- len.cv5_linear_fm_t5 <- matrix(0,nrep,l)
cov.cvloo_linear_fm_t5 <- len.cvloo_linear_fm_t5 <- matrix(0,nrep,l)


out.efcp.up <- out.efcp.lo <- matrix(0,n0,l)
out.vfcp.up <- out.vfcp.lo <- matrix(0,n0,l)
out.naive.up <- out.naive.lo <- matrix(0,n0,l)
out.param.up <- out.param.lo <- matrix(0,n0,l)
out.star.up <- out.star.lo <- matrix(0,n0,l)
out.cv10.up <- out.cv10.lo <- matrix(0,n0,l)
out.cv5.up <- out.cv5.lo <- matrix(0,n0,l)
out.cvloo.up <- out.cvloo.lo <- matrix(0,n0,l)



for(i in 1:nrep){
  if(i%%10 == 0){
    print(i)
  }
  for (r in 1:l){
    d <- dim[r]
    set.seed(i)

    Sigma <- matrix(rho,d,d)
    diag(Sigma) <- rep(1,d)
    X <- rmvt(n,Sigma,df)   #multivariate t distribution
    beta <- rep(1:5,d/5)
    eps <- rt(n,df)*(1+sqrt(X[,1]^2+X[,2]^2))
    Y <- X%*%beta+eps


    X0 <- rmvt(n0,Sigma,df)
    eps0 <- rt(n0,df)*(1+sqrt(X0[,1]^2+X0[,2]^2))
    Y0 <- X0%*%beta+eps0


    out.param <- ginverse.fun(X,Y,X0,alpha=alpha)
    out.param.lo[,r] <- out.param$lo
    out.param.up[,r] <- out.param$up    
    cov.param_linear_fm_t5[i,r] <- mean(out.param.lo[,r] <= Y0 & Y0 <= out.param.up[,r]) 
    len.param_linear_fm_t5[i,r] <- mean(out.param.up[,r]-out.param.lo[,r]) 


    out.efcp <- efcp_ridge(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    out.efcp.up[,r] <- out.efcp$up
    out.efcp.lo[,r] <- out.efcp$lo    
    cov.efcp_linear_fm_t5[i,r] <- mean(out.efcp.lo[,r] <= Y0 & Y0 <= out.efcp.up[,r]) 
    len.efcp_linear_fm_t5[i,r] <- mean(out.efcp.up[,r]-out.efcp.lo[,r]) 


    out.vfcp <- vfcp_ridge(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    out.vfcp.up[,r] <- out.vfcp$up
    out.vfcp.lo[,r] <- out.vfcp$lo    
    cov.vfcp_linear_fm_t5[i,r] <- mean(out.vfcp.lo[,r] <= Y0 & Y0 <= out.vfcp.up[,r]) 
    len.vfcp_linear_fm_t5[i,r] <- mean(out.vfcp.up[,r]-out.vfcp.lo[,r]) 

    out.naive <- naive.fun(X,Y,X0,alpha=alpha)
    out.naive.up[,r] <- out.naive$up
    out.naive.lo[,r] <- out.naive$lo    
    cov.naive_linear_fm_t5[i,r] <- mean(out.naive.lo[,r] <= Y0 & Y0 <= out.naive.up[,r]) 
    len.naive_linear_fm_t5[i,r] <- mean(out.naive.up[,r]-out.naive.lo[,r]) 


    out.star <- star.fun(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    out.star.up[,r] <- out.star$up
    out.star.lo[,r] <- out.star$lo   
    cov.star_linear_fm_t5[i,r] <- mean(out.star.lo[,r] <= Y0 & Y0 <= out.star.up[,r]) 
    len.star_linear_fm_t5[i,r] <- mean(out.star.up[,r] - out.star.lo[,r]) 


    out.cv5 <- cv.fun(X,Y,X0,lambda=lambda_seq,alpha=alpha,nfolds=5)
    out.cv5.up[,r] <- out.cv5$up
    out.cv5.lo[,r] <- out.cv5$lo    
    cov.cv5_linear_fm_t5[i,r] <- mean(out.cv5.lo[,r] <= Y0 & Y0 <= out.cv5.up[,r]) 
    len.cv5_linear_fm_t5[i,r] <- mean(out.cv5.up[,r] - out.cv5.lo[,r])

  }
}

ridge_linear_cov100_t5=list(dim, cov.param_linear_fm_t5, cov.naive_linear_fm_t5, cov.vfcp_linear_fm_t5, cov.star_linear_fm_t5, cov.cv5_linear_fm_t5, cov.efcp_linear_fm_t5)
save(ridge_linear_cov100_t5, file = "ridge_linear_cov100_t5.rda" )
ridge_linear_len100_t5=list(len.param_linear_fm_t5, len.naive_linear_fm_t5, len.vfcp_linear_fm_t5, len.star_linear_fm_t5, len.cv5_linear_fm_t5, len.efcp_linear_fm_t5)
save(ridge_linear_len100_t5, file = "ridge_linear_len100_t5.rda" )
  2. Now we make the coverage and width-efficiency plots shown in our paper, where width efficiency is the interval width divided by that of VFCP.
data(ridge_linear_cov100_t3,package = "ConformalSmallest")
data(ridge_linear_len100_t3,package = "ConformalSmallest")
dim=ridge_linear_cov100_t3[[1]]
cov.param_linear_fm_t3=ridge_linear_cov100_t3[[2]]
cov.naive_linear_fm_t3=ridge_linear_cov100_t3[[3]]
cov.vfcp_linear_fm_t3=ridge_linear_cov100_t3[[4]]
cov.star_linear_fm_t3=ridge_linear_cov100_t3[[5]]
cov.cv5_linear_fm_t3=ridge_linear_cov100_t3[[6]]
cov.efcp_linear_fm_t3=ridge_linear_cov100_t3[[7]]

len.param_linear_fm_t3=ridge_linear_len100_t3[[1]]
len.naive_linear_fm_t3=ridge_linear_len100_t3[[2]]
len.vfcp_linear_fm_t3=ridge_linear_len100_t3[[3]]
len.star_linear_fm_t3=ridge_linear_len100_t3[[4]]
len.cv5_linear_fm_t3=ridge_linear_len100_t3[[5]]
len.efcp_linear_fm_t3=ridge_linear_len100_t3[[6]]


len.param = len.param_linear_fm_t3/len.vfcp_linear_fm_t3
len.naive = len.naive_linear_fm_t3/len.vfcp_linear_fm_t3
len.star = len.star_linear_fm_t3/len.vfcp_linear_fm_t3
len.cv5 = len.cv5_linear_fm_t3/len.vfcp_linear_fm_t3
len.efcp = len.efcp_linear_fm_t3/len.vfcp_linear_fm_t3
len.vfcp = len.vfcp_linear_fm_t3/len.vfcp_linear_fm_t3


df.cov3=data.frame(dim,apply(cov.param_linear_fm_t3,2,mean),apply(cov.naive_linear_fm_t3,2,mean),apply(cov.vfcp_linear_fm_t3,2,mean),apply(cov.star_linear_fm_t3,2,mean),apply(cov.cv5_linear_fm_t3,2,mean), apply(cov.efcp_linear_fm_t3,2,mean))
df.cov3_sd=data.frame(dim,apply(cov.param_linear_fm_t3,2,sd),apply(cov.naive_linear_fm_t3,2,sd),apply(cov.vfcp_linear_fm_t3,2,sd),apply(cov.star_linear_fm_t3,2,sd),apply(cov.cv5_linear_fm_t3,2,sd), apply(cov.efcp_linear_fm_t3,2,sd))/sqrt(nrow(len.param))


df.len3=data.frame(dim,apply(len.param,2,mean),apply(len.naive,2,mean),apply(len.vfcp,2,mean),apply(len.star,2,mean),apply(len.cv5,2,mean), apply(len.efcp,2,mean))
df.len3_sd=data.frame(dim,apply(len.param,2,sd),apply(len.naive,2,sd),apply(len.vfcp,2,sd),apply(len.star,2,sd),apply(len.cv5,2,sd), apply(len.efcp,2,sd))/sqrt(nrow(len.param))


data(ridge_linear_cov100_t5,package = "ConformalSmallest")
data(ridge_linear_len100_t5,package = "ConformalSmallest")

dim=ridge_linear_cov100_t5[[1]]
cov.param_linear_fm_t5=ridge_linear_cov100_t5[[2]]
cov.naive_linear_fm_t5=ridge_linear_cov100_t5[[3]]
cov.vfcp_linear_fm_t5=ridge_linear_cov100_t5[[4]]
cov.star_linear_fm_t5=ridge_linear_cov100_t5[[5]]
cov.cv5_linear_fm_t5=ridge_linear_cov100_t5[[6]]
cov.efcp_linear_fm_t5=ridge_linear_cov100_t5[[7]]

len.param_linear_fm_t5=ridge_linear_len100_t5[[1]]
len.naive_linear_fm_t5=ridge_linear_len100_t5[[2]]
len.vfcp_linear_fm_t5=ridge_linear_len100_t5[[3]]
len.star_linear_fm_t5=ridge_linear_len100_t5[[4]]
len.cv5_linear_fm_t5=ridge_linear_len100_t5[[5]]
len.efcp_linear_fm_t5=ridge_linear_len100_t5[[6]]


len.param = len.param_linear_fm_t5/len.vfcp_linear_fm_t5
len.naive = len.naive_linear_fm_t5/len.vfcp_linear_fm_t5
len.star = len.star_linear_fm_t5/len.vfcp_linear_fm_t5
len.cv5 = len.cv5_linear_fm_t5/len.vfcp_linear_fm_t5
len.efcp = len.efcp_linear_fm_t5/len.vfcp_linear_fm_t5
len.vfcp = len.vfcp_linear_fm_t5/len.vfcp_linear_fm_t5

df.cov5=data.frame(dim,apply(cov.param_linear_fm_t5,2,mean),apply(cov.naive_linear_fm_t5,2,mean),apply(cov.vfcp_linear_fm_t5,2,mean),apply(cov.star_linear_fm_t5,2,mean),apply(cov.cv5_linear_fm_t5,2,mean), apply(cov.efcp_linear_fm_t5,2,mean))
df.cov5_sd=data.frame(dim,apply(cov.param_linear_fm_t5,2,sd),apply(cov.naive_linear_fm_t5,2,sd),apply(cov.vfcp_linear_fm_t5,2,sd),apply(cov.star_linear_fm_t5,2,sd),apply(cov.cv5_linear_fm_t5,2,sd), apply(cov.efcp_linear_fm_t5,2,sd))/sqrt(nrow(len.param))

df.len5=data.frame(dim,apply(len.param,2,mean),apply(len.naive,2,mean),apply(len.vfcp,2,mean),apply(len.star,2,mean),apply(len.cv5,2,mean), apply(len.efcp,2,mean))
df.len5_sd=data.frame(dim,apply(len.param,2,sd),apply(len.naive,2,sd),apply(len.vfcp,2,sd),apply(len.star,2,sd),apply(len.cv5,2,sd), apply(len.efcp,2,sd))/sqrt(nrow(len.param))


bgnd <- theme_get()$panel.background$fill
names=c("Linear","Naive","VFCP","CV*","CV-5-fold","EFCP")
#colors_manual=c("blue","#FF9933","#999000","66FFCC","#66CC99","#9999FF","#FF00CC")
colors_manual=c("red","slategrey","darkorchid3","dodgerblue4","turquoise3","grey23")

shape_manual=c(0,1,2,3,4,5)

seq=seq(2,60,by=2)

col_names=c("dim","cov.param","cov.naive","cov.vfcp","cov.star","cov.cv5","cov.efcp")
names(df.cov3)=names(df.cov5)=names(df.cov3_sd)=names(df.cov5_sd)=col_names


col_names=c("dim","len.param","len.naive","len.vfcp","len.star","len.cv5","len.efcp")
names(df.len3)=names(df.len5)=names(df.len3_sd)=names(df.len5_sd)=col_names

df.cov = rbind(df.cov3[seq,], df.cov5[seq,])
df.cov_sd = rbind(df.cov3_sd[seq,], df.cov5_sd[seq,])
df.len = rbind(df.len3[seq,], df.len5[seq,])
df.len_sd = rbind(df.len3_sd[seq,], df.len5_sd[seq,])
Moment = c("3rd moment", "5th moment")
df_names = expand.grid(seq,Moment,names)

cov_vec = as.vector(as.matrix(df.cov[,-1]))
cov_sd_vec = as.vector(as.matrix(df.cov_sd[,-1]))
cov = cbind(df_names[,1], cov_vec, cov_sd_vec , df_names[,-1] ) 


len_vec = as.vector(as.matrix(df.len[,-1]))
len_sd_vec = as.vector(as.matrix(df.len_sd[,-1]))
len = cbind(df_names[,1], len_vec, len_sd_vec , df_names[,-1]) 

cov$Var="Coverage"
len$Var = "Width ratio"

colnames(cov) = c("V1","V2","sd","Moment","Method","Var")
colnames(len) = c("V1","V2","sd","Moment","Method","Var")
data_linear_fm=rbind(cov,len)
dummy_coverage <- data.frame(Var=c("Coverage"),Z= 0.9)

ggplot_linear_fm=ggplot(data=data_linear_fm, aes(x=V1, y=V2, group=Method,color=Method)) +
  geom_errorbar(aes(ymin=V2-sd, ymax=V2+sd), width=.1)+
  geom_line(size = 0.8, aes(color=Method))+  #,linetype=Method)) +
  #geom_point(size=5, colour=bgnd)+
  #geom_point(aes(shape=Method,color=Method)) +
  theme(#axis.text.y = element_blank(), 
    #axis.ticks.y = element_blank(), 
    axis.title.y = element_blank(),
    #plot.title = element_text(size = 8,hjust=0.5),
    #axis.text = element_text(size = 10),
    #axis.title =element_text(size = 10),
    legend.position="top") +
  facet_grid(Var~Moment,scales = "free")+
  #scale_shape_manual(values=shape_manual) + 
  scale_color_manual(values=colors_manual) +
  #ylim(0,1.2) + #xlab("dim") + ylab("Avg-Len")  
  xlab("Dimension")+guides(shape=FALSE)+
  theme(strip.text.x = element_text(size = 12, face = "bold"), strip.text.y = element_text(size=12, face="bold"), axis.ticks.length=unit(.25, "cm"),axis.title=element_text(size=12))+ geom_hline(data=dummy_coverage, aes(yintercept=Z), linetype="dashed")
ggplot_linear_fm
  3. Finally, we compare the values of $\lambda$ chosen by the different methods.
library(repr)
library(glmnet)
library(MASS)
library(mvtnorm)
name=paste("linear_fm_t3",sep="")
set.seed(2021)

df <- 3  #degrees of freedom
l <- 1    #number of dimensions 
l.lambda <- 100
lambda_seq <- seq(0,200,l=l.lambda)
dim <- 5
alpha <- 0.1
n <- 200   #number of training samples
n0 <- 100  #number of prediction points
nrep <- 100 #number of independent trials
rho <- 0.5

lambda.efcp <- lambda.vfcp <- lambda.star <- lambda.cv5 <- rep(0,nrep)


for(i in 1:nrep){
  for (r in 1:l){
    d <- dim[r]
    set.seed(i)

    Sigma <- matrix(rho,d,d)
    diag(Sigma) <- rep(1,d)
    X <- rmvt(n,Sigma,df)   #multivariate t distribution
    beta <- rep(1:5,d/5)
    eps <- rt(n,df)*(1+sqrt(X[,1]^2+X[,2]^2))
    Y <- X%*%beta+eps

    X0 <- rmvt(n0,Sigma,df)
    eps0 <- rt(n0,df)*(1+sqrt(X0[,1]^2+X0[,2]^2))
    Y0 <- X0%*%beta+eps0



    out.efcp <- efcp_ridge(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    lambda.efcp[i] <- out.efcp$lambda

    out.vfcp <- vfcp_ridge(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    lambda.vfcp[i] <- out.vfcp$lambda

    out.star <- star.fun(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    lambda.star[i] <- out.star$lambda

    out.cv5 <- cv.fun(X,Y,X0,lambda=lambda_seq,alpha=alpha,nfolds=5)
    lambda.cv5[i] <- out.cv5$lambda
  }
}
#options(repr.plot.width=18, repr.plot.height=6)
#par(mar=c(1,1,1,1))
oldpar=par(mfrow=c(2,2))
hist(lambda.cv5,breaks=seq(min(lambda.cv5)-1,max(lambda.cv5)+1, by = 2),xlab=expression(lambda), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5,main="CV-2splits") #make x,y, values, main larger
hist(lambda.star,breaks=seq(min(lambda.star)-1,max(lambda.star)+1, by = 2),xlab=expression(lambda), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5,main="CV-3splits")
hist(lambda.efcp,breaks=seq(min(lambda.efcp)-1,max(lambda.efcp)+1, by = 2),xlab=expression(lambda), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5,main="EFCP")
hist(lambda.vfcp,breaks=seq(min(lambda.vfcp)-1,max(lambda.vfcp)+1, by = 2),xlab=expression(lambda), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5,main="VFCP")
par(oldpar)   #restore the graphical parameters changed above

The above plots show that all four methods mostly choose $\lambda$ close to zero, which is expected: in this low-dimensional example the underlying data-generating mechanism is linear, so little regularization is needed.
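To complement the histograms with a numeric summary, one can tabulate the selected penalties across the 100 replications (a short sketch using the lambda.cv5, lambda.star, lambda.efcp and lambda.vfcp vectors computed above):

#summaries of the chosen penalty, one row per method
round(rbind(
  "CV-2splits" = summary(lambda.cv5),
  "CV-3splits" = summary(lambda.star),
  "EFCP"       = summary(lambda.efcp),
  "VFCP"       = summary(lambda.vfcp)
), 2)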

Nonlinear setting

name=paste("nonlinear_fm_t3",sep="")
set.seed(2021)

df <- 3  #degrees of freedom
l <- 1    #number of dimensions 
l.lambda <- 100
lambda_seq <- seq(0,200,l=l.lambda)
dim <- 5
alpha <- 0.1
n <- 200   #number of training samples
n0 <- 100  #number of prediction points
nrep <- 100 #number of independent trials
rho <- 0.5

lambda.efcp <- lambda.vfcp <- lambda.star <- lambda.cv5 <- rep(0,nrep)


for(i in 1:nrep){
  for (r in 1:l){
    d <- dim[r]
    set.seed(i)

    Sigma=matrix(rho,d,d)
    diag(Sigma)=rep(1,d)
    X=rmvt(n,Sigma,df)  #multivariate t distribution

    eps1=rt(n,df)*(1+sqrt(X[,1]^2+X[,2]^2))
    eps2=rt(n,df)*(1+sqrt(X[,1]^4+X[,2]^4))
    Y=rpois(n,sin(X[,1])^2 + cos(X[,2])^4+0.01 )+0.03*X[,1]*eps1+25*(runif(n,0,1)<0.01)*eps2   #matches the test-point generation below

    X0=rmvt(n0,Sigma,df)
    eps01=rt(n0,df)*(1+sqrt(X0[,1]^2+X0[,2]^2))
    eps02=rt(n0,df)*(1+sqrt(X0[,1]^4+X0[,2]^4))
    Y0=rpois(n0,sin(X0[,1])^2 + cos(X0[,2])^4+0.01 )+0.03*X0[,1]*eps01+25*(runif(n0,0,1)<0.01)*eps02

    out.efcp <- efcp_ridge(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    lambda.efcp[i] <- out.efcp$lambda

    out.vfcp <- vfcp_ridge(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    lambda.vfcp[i] <- out.vfcp$lambda

    out.star <- star.fun(X,Y,X0,lambda=lambda_seq,alpha=alpha)
    lambda.star[i] <- out.star$lambda

    out.cv5 <- cv.fun(X,Y,X0,lambda=lambda_seq,alpha=alpha,nfolds=5)
    lambda.cv5[i] <- out.cv5$lambda
  }
}

#options(repr.plot.width=18, repr.plot.height=6)
oldpar=par(mfrow=c(2,2))
#par(mar=c(1,1,1,1))
hist(lambda.cv5,breaks=seq(min(lambda.cv5)-1,max(lambda.cv5)+1, by = 2),xlab=expression(lambda), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5,main="CV-2splits") #make x,y, values, main larger
hist(lambda.star,breaks=seq(min(lambda.star)-1,max(lambda.star)+1, by = 2),xlab=expression(lambda), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5,main="CV-3splits")
hist(lambda.efcp,breaks=seq(min(lambda.efcp)-1,max(lambda.efcp)+1, by = 2),xlab=expression(lambda), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5,main="EFCP")
hist(lambda.vfcp,breaks=seq(min(lambda.vfcp)-1,max(lambda.vfcp)+1, by = 2),xlab=expression(lambda), cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5,main="VFCP")
par(oldpar)

The above plots show that all four methods mostly choose the penalty parameter $\lambda$ to be as large as possible, which is also expected: the underlying data-generating mechanism is nonlinear, so the misspecified linear fit is best shrunk heavily.
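As a quick numeric companion to the histograms, one can check how often each method selects the largest penalty in the grid (a sketch using the lambda vectors and lambda_seq from the chunk above):

#fraction of replications in which the chosen penalty hits the top of the grid
lambda_max <- max(lambda_seq)
c("CV-2splits" = mean(lambda.cv5  == lambda_max),
  "CV-3splits" = mean(lambda.star == lambda_max),
  "EFCP"       = mean(lambda.efcp == lambda_max),
  "VFCP"       = mean(lambda.vfcp == lambda_max))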


