sim-with-sb.R
In SelectBoost: A General Algorithm to Enhance the Performance of Variable Selection Methods in Correlated Datasets

## ----setup, include = FALSE---------------------------------------------------
#file.edit(normalizePath("~/.Renviron"))
LOCAL <- identical(Sys.getenv("LOCAL"), "TRUE")
#LOCAL=TRUE
knitr::opts_chunk$set(purl = LOCAL)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ---- eval = FALSE------------------------------------------------------------
#  devtools::install_github("fbertran/SelectBoost", ref = "doMC")

## ---- eval = LOCAL------------------------------------------------------------
library(SelectBoost)
group<-c(1:9,1) #10 variables
cor_group<-rep(0.95,9)
CM<-simulation_cor(group,cor_group)
CM

## ---- eval = LOCAL------------------------------------------------------------
set.seed(3141)
N<-10
X<-simulation_X(N,CM)
X

## ---- eval = LOCAL------------------------------------------------------------
set.seed(3141)
supp<-c(1,1,1,0,0,0,0,0,0,0)
minB<-1
maxB<-2
stn<-50
firstdataset=simulation_DATA(X,supp,minB,maxB,stn)
firstdataset

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
set.seed(3141)
NDatasets=200
for(i in 1:NDatasets){
X<-simulation_X(N,CM)
assign(paste("DATA_exemple1_nb_",i,sep=""),simulation_DATA(X,supp,minB,maxB,stn))
}

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
corr_sum=matrix(0,length(group),length(group))
for(i in 1:NDatasets){
corr_sum=corr_sum+cor(get(paste("DATA_exemple1_nb_",i,sep=""))$X)
}
corr_mean=corr_sum/NDatasets

## ---- fig.width=7, eval = LOCAL-----------------------------------------------
corr_mean
plot(abs(corr_mean))

## ---- eval = LOCAL------------------------------------------------------------
coef_sum=rep(0,length(group))
names(coef_sum)<-paste("x",1:length(group),sep="")
error_counter=0
for(i in 1:NDatasets){
tempdf=data.frame(cbind(Y=get(paste("DATA_exemple1_nb_",i,sep=""))$Y,
                        get(paste("DATA_exemple1_nb_",i,sep=""))$X))
tempcoef=coef(lm(Y~.-1,data=tempdf))
if(is.null(tempcoef)){
cat("Error in lm fit, skip coefficients\n")
error_counter=error_counter+1
  } else{
coef_sum=coef_sum+abs(tempcoef)
}
}
error_counter
coef_mean=coef_sum/NDatasets

## ---- eval = LOCAL------------------------------------------------------------
coef_mean
barplot(coef_mean)
abline(h=(minB+maxB)/2,lwd=2,lty=2,col="blue")

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
set.seed(3141)
stn <- 5000
for(i in 1:NDatasets){
X<-simulation_X(N,CM)
assign(paste("DATA_exemple1_bis_nb_",i,sep=""),simulation_DATA(X,supp,minB,maxB,stn))
}

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
stn_ratios=rep(0,NDatasets)
for(i in 1:NDatasets){
stn_ratios[i]<-get(paste("DATA_exemple1_nb_",i,sep=""))$sigma/
  get(paste("DATA_exemple1_bis_nb_",i,sep=""))$sigma
}
all(sapply(stn_ratios,all.equal,10))

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
coef_sum_bis=rep(0,length(group))
names(coef_sum_bis)<-paste("x",1:length(group),sep="")
error_counter_bis=0
for(i in 1:NDatasets){
tempdf=data.frame(cbind(Y=get(paste("DATA_exemple1_bis_nb_",i,sep=""))$Y,
                        get(paste("DATA_exemple1_bis_nb_",i,sep=""))$X))
tempcoef=coef(lm(Y~.-1,data=tempdf))
if(is.null(tempcoef)){
cat("Error in lm fit, skip coefficients\n")
error_counter_bis=error_counte_bisr+1
  } else{
coef_sum_bis=coef_sum_bis+abs(tempcoef)
}
}
error_counter_bis
coef_mean_bis=coef_sum_bis/NDatasets

## ---- eval = LOCAL------------------------------------------------------------
coef_mean_bis
barplot(coef_mean_bis)
abline(h=(minB+maxB)/2,lwd=2,lty=2,col="blue")

## ---- eval = LOCAL------------------------------------------------------------
group<-rep(1,50) #50 variables
cor_group<-rep(0.5,1)

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
set.seed(3141)
N<-20
supp<-c(1,1,1,1,1,rep(0,45))
minB<-1
maxB<-2
stn<-50
for(i in 1:200){
C<-simulation_cor(group,cor_group)
X<-simulation_X(N,C)
assign(paste("DATA_exemple2_nb_",i,sep=""),simulation_DATA(X,supp,minB,maxB,stn))
}

## ---- eval = LOCAL------------------------------------------------------------
corr_sum=matrix(0,length(group),length(group))
for(i in 1:NDatasets){
corr_sum=corr_sum+cor(get(paste("DATA_exemple2_nb_",i,sep=""))$X)
}
corr_mean=corr_sum/NDatasets

## ---- fig.keep='none', fig.width=7, eval = LOCAL------------------------------
corr_mean[1:10,1:10]
plot(abs(corr_mean))

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
coef_sum=rep(0,length(group))
coef_lasso_sum=rep(0,length(group))
names(coef_sum)<-paste("x",1:length(group),sep="")
names(coef_lasso_sum)<-paste("x",1:length(group),sep="")
error_counter=0
for(i in 1:NDatasets){
tempdf=data.frame(cbind(Y=get(paste("DATA_exemple2_nb_",i,sep=""))$Y,
                        get(paste("DATA_exemple2_nb_",i,sep=""))$X))
tempcoef=coef(lm(Y~.-1,data=tempdf))
require(lars)
lasso.1 <- lars::lars(x=get(paste("DATA_exemple2_nb_",i,sep=""))$X,
              y=get(paste("DATA_exemple2_nb_",i,sep=""))$Y, type="lasso", 
              trace=FALSE, normalize=FALSE, intercept = FALSE)
# cv.lars() uses crossvalidation to estimate optimal position in path
cv.lasso.1 <- lars::cv.lars(x=get(paste("DATA_exemple2_nb_",i,sep=""))$X,
              y=get(paste("DATA_exemple2_nb_",i,sep=""))$Y, 
              plot.it=FALSE, type="lasso")
# Use the "+1SE rule" to find best model: 
#    Take the min CV and add its SE ("limit").  
#    Find smallest model that has its own CV within this limit (at "s.cv.1")
limit <- min(cv.lasso.1$cv) + cv.lasso.1$cv.error[which.min(cv.lasso.1$cv)]
s.cv.1 <- cv.lasso.1$index[min(which(cv.lasso.1$cv < limit))]
# Print out coefficients at optimal s.
coef_lasso_sum=coef_lasso_sum+abs(coef(lasso.1, s=s.cv.1, mode="fraction"))
if(is.null(tempcoef)){
cat("Error in lm fit, skip coefficients\n")
error_counter=error_counter+1
  } else{
coef_sum=coef_sum+abs(tempcoef)
}
}
error_counter
coef_mean=coef_sum/NDatasets
coef_lasso_mean=coef_lasso_sum/NDatasets

## ---- eval = LOCAL------------------------------------------------------------
coef_mean
barplot(coef_mean)
abline(h=(minB+maxB)/2,lwd=2,lty=2,col="blue")

## ---- eval = LOCAL------------------------------------------------------------
coef_lasso_mean
barplot(coef_lasso_mean,ylim=c(0,1.5))
abline(h=(minB+maxB)/2,lwd=2,lty=2,col="blue")

## ---- fig.keep='none', cache=TRUE, eval = LOCAL-------------------------------
require(CascadeData)
data(micro_S)
data(micro_US)
require(Cascade)
micro_US<-as.micro_array(micro_US,c(60,90,240,390),6)
micro_S<-as.micro_array(micro_S,c(60,90,240,390),6)
S<-geneSelection(list(micro_S,micro_US),list("condition",c(1,2),1),-1)
Sel<-micro_S@microarray[S@name,]

## ---- fig.keep='none', cache=TRUE, eval = LOCAL-------------------------------
summary(S)

## ---- fig.keep='last', cache=TRUE, eval = LOCAL-------------------------------
plot(S)

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
set.seed(3141)
supp<-c(1,1,1,1,1,rep(0,95))
minB<-1
maxB<-2
stn<-50
NDatasets=200

for(i in 1:NDatasets){
X<-t(as.matrix(Sel[sample(1:nrow(Sel),100),]))
Xnorm<-t(t(X)/sqrt(colSums(X*X)))
assign(paste("DATA_exemple3_nb_",i,sep=""),simulation_DATA(Xnorm,supp,minB,maxB,stn))
}

## ---- eval=FALSE--------------------------------------------------------------
#  plot(cor(Xnorm))
#  mixOmics::cim(cor(Xnorm))

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
coef_sum=rep(0,length(supp))
coef_lasso_sum=rep(0,length(supp))
names(coef_sum)<-paste("x",1:length(supp),sep="")
names(coef_lasso_sum)<-paste("x",1:length(supp),sep="")
error_counter=0
for(i in 1:NDatasets){
tempdf=data.frame(cbind(Y=get(paste("DATA_exemple3_nb_",i,sep=""))$Y,
                        get(paste("DATA_exemple3_nb_",i,sep=""))$X))
tempcoef=coef(lm(Y~.-1,data=tempdf))
require(lars)
lasso.1 <- lars::lars(x=get(paste("DATA_exemple3_nb_",i,sep=""))$X,
              y=get(paste("DATA_exemple3_nb_",i,sep=""))$Y, type="lasso", 
              trace=FALSE, normalize=FALSE, intercept = FALSE)
# cv.lars() uses crossvalidation to estimate optimal position in path
cv.lasso.1 <- lars::cv.lars(x=get(paste("DATA_exemple3_nb_",i,sep=""))$X,
              y=get(paste("DATA_exemple3_nb_",i,sep=""))$Y, 
              plot.it=FALSE, normalize=FALSE, intercept = FALSE, type="lasso")
# Use the "+1SE rule" to find best model: 
#    Take the min CV and add its SE ("limit").  
#    Find smallest model that has its own CV within this limit (at "s.cv.1")
limit <- min(cv.lasso.1$cv) + cv.lasso.1$cv.error[which.min(cv.lasso.1$cv)]
s.cv.1 <- cv.lasso.1$index[min(which(cv.lasso.1$cv < limit))]
# Print out coefficients at optimal s.
coef_lasso_sum=coef_lasso_sum+abs(coef(lasso.1, s=s.cv.1, mode="fraction"))
if(is.null(tempcoef)){
cat("Error in lm fit, skip coefficients\n")
error_counter=error_counter+1
  } else{
coef_sum=coef_sum+abs(tempcoef)
}
}
error_counter
coef_mean=coef_sum/NDatasets
coef_lasso_mean=coef_lasso_sum/NDatasets

## ---- eval = LOCAL------------------------------------------------------------
coef_mean
barplot(coef_mean)
abline(h=(minB+maxB)/2,lwd=2,lty=2,col="blue")

## ---- eval = LOCAL------------------------------------------------------------
coef_lasso_mean
barplot(coef_lasso_mean,ylim=c(0,1.5))
abline(h=(minB+maxB)/2,lwd=2,lty=2,col="blue")

## ---- fig.keep='none', cache=TRUE, eval = LOCAL-------------------------------
require(CascadeData)
data(micro_S)
data(micro_US)
require(Cascade)
micro_US<-as.micro_array(micro_US,c(60,90,240,390),6)
micro_S<-as.micro_array(micro_S,c(60,90,240,390),6)
S<-geneSelection(list(micro_S,micro_US),list("condition",c(1,2),1),101)
Sel<-micro_S@microarray[S@name,]

## ---- fig.keep='none', cache=TRUE, eval = LOCAL-------------------------------
summary(S)

## ---- fig.keep='last', cache=TRUE, eval = LOCAL-------------------------------
plot(S)

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
suppt<-rep(1:4,6)
supp<-c(1,1,1,1,1,rep(0,95)) #not used since we use one of the probeset expressions as response
minB<-1 #not used since we use one of the probeset expressions as response
maxB<-2 #not used since we use one of the probeset expressions as response
stn<-50 #not used since we use one of the probeset expressions as response
NDatasets<-101

set.seed(3141)
for(i in 1:NDatasets){
  #the explanatory variables are the values for the 1st, 2nd and 3rd timepoints
  X<-t(as.matrix(Sel[-i,suppt!=4]))
  Xnorm<-t(t(X)/sqrt(colSums(X*X)))
  DATA<-simulation_DATA(Xnorm,supp,minB,maxB,stn)
  #the reponses are the values for the 2nd, 3rd and 4th timepoints
  DATA$Y<-as.vector(t(Sel[i,suppt!=1]))
  assign(paste("DATA_exemple4_nb_",i,sep=""),DATA)
  rm(DATA)
}

## ---- eval=FALSE--------------------------------------------------------------
#  plot(cor(Xnorm))
#  mixOmics::cim(cor(Xnorm))

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
coef_sum=rep(0,length(supp)+1)
coef_lasso_sum=rep(0,length(supp)+1)
names(coef_sum)<-paste("x",1:(length(supp)+1),sep="")
names(coef_lasso_sum)<-paste("x",1:(length(supp)+1),sep="")
error_counter=0
for(i in 1:NDatasets){
tempdf=data.frame(cbind(Y=get(paste("DATA_exemple4_nb_",i,sep=""))$Y,
                        get(paste("DATA_exemple4_nb_",i,sep=""))$X))
tempcoef=coef(lm(Y~.-1,data=tempdf))
require(lars)
lasso.1 <- lars::lars(x=get(paste("DATA_exemple4_nb_",i,sep=""))$X,
              y=get(paste("DATA_exemple4_nb_",i,sep=""))$Y, type="lasso", 
              trace=FALSE, normalize=FALSE, intercept = FALSE)
# cv.lars() uses crossvalidation to estimate optimal position in path
cv.lasso.1 <- lars::cv.lars(x=get(paste("DATA_exemple4_nb_",i,sep=""))$X,
              y=get(paste("DATA_exemple4_nb_",i,sep=""))$Y, 
              plot.it=FALSE, normalize=FALSE, intercept = FALSE, type="lasso")
# Use the "+1SE rule" to find best model: 
#    Take the min CV and add its SE ("limit").  
#    Find smallest model that has its own CV within this limit (at "s.cv.1")
limit <- min(cv.lasso.1$cv) + cv.lasso.1$cv.error[which.min(cv.lasso.1$cv)]
s.cv.1 <- cv.lasso.1$index[min(which(cv.lasso.1$cv < limit))]
# Print out coefficients at optimal s.
coef_lasso_sum[-i]=coef_lasso_sum[-i]+abs(coef(lasso.1, s=s.cv.1, mode="fraction"))
if(is.null(tempcoef)){
cat("Error in lm fit, skip coefficients\n")
error_counter=error_counter+1
  } else{
coef_sum[-i]=coef_sum[-i]+abs(tempcoef)
}
}
error_counter
coef_mean=coef_sum/NDatasets
coef_lasso_mean=coef_lasso_sum/NDatasets

## ---- eval = LOCAL------------------------------------------------------------
head(coef_mean, 40)
barplot(coef_mean)
abline(h=(minB+maxB)/2,lwd=2,lty=2,col="blue")

## ---- eval = LOCAL------------------------------------------------------------
head(coef_lasso_mean, 40)
barplot(coef_lasso_mean)

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
  group<-rep(1,500) #500 variables
  cor_group<-rep(0.5,1)

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
set.seed(3141)
N<-25
supp<-c(1,1,1,1,1,rep(0,495))
minB<-1
maxB<-2
stn<-50
for(i in 1:NDatasets){
  C<-simulation_cor(group,cor_group)
  X<-simulation_X(N,C)
  assign(paste("DATA_exemple5_nb_",i,sep=""),simulation_DATA(X,supp,minB,maxB,stn))
}

## ---- eval = LOCAL------------------------------------------------------------
corr_sum=matrix(0,length(group),length(group))
for(i in 1:NDatasets){
corr_sum=corr_sum+cor(get(paste("DATA_exemple5_nb_",i,sep=""))$X)
}
corr_mean=corr_sum/NDatasets

## ---- fig.keep='none', fig.width=7, eval = LOCAL------------------------------
corr_mean[1:10,1:10]
plot(abs(corr_mean))

## ---- cache=TRUE, eval = LOCAL------------------------------------------------
coef_sum=rep(0,length(group))
coef_lasso_sum=rep(0,length(group))
names(coef_sum)<-paste("x",1:length(group),sep="")
names(coef_lasso_sum)<-paste("x",1:length(group),sep="")
error_counter=0
for(i in 1:NDatasets){
tempdf=data.frame(cbind(Y=get(paste("DATA_exemple5_nb_",i,sep=""))$Y,
                        get(paste("DATA_exemple5_nb_",i,sep=""))$X))
tempcoef=coef(lm(Y~.-1,data=tempdf))
require(lars)
lasso.1 <- lars::lars(x=get(paste("DATA_exemple5_nb_",i,sep=""))$X,
              y=get(paste("DATA_exemple5_nb_",i,sep=""))$Y, type="lasso", 
              trace=FALSE, normalize=FALSE, intercept = FALSE)
# cv.lars() uses crossvalidation to estimate optimal position in path
cv.lasso.1 <- lars::cv.lars(x=get(paste("DATA_exemple5_nb_",i,sep=""))$X,
              y=get(paste("DATA_exemple5_nb_",i,sep=""))$Y, 
              plot.it=FALSE, type="lasso")
# Use the "+1SE rule" to find best model: 
#    Take the min CV and add its SE ("limit").  
#    Find smallest model that has its own CV within this limit (at "s.cv.1")
limit <- min(cv.lasso.1$cv) + cv.lasso.1$cv.error[which.min(cv.lasso.1$cv)]
s.cv.1 <- cv.lasso.1$index[min(which(cv.lasso.1$cv < limit))]
# Print out coefficients at optimal s.
coef_lasso_sum=coef_lasso_sum+abs(coef(lasso.1, s=s.cv.1, mode="fraction"))
if(is.null(tempcoef)){
cat("Error in lm fit, skip coefficients\n")
error_counter=error_counter+1
  } else{
coef_sum=coef_sum+abs(tempcoef)
}
}
error_counter
coef_mean=coef_sum/NDatasets
coef_lasso_mean=coef_lasso_sum/NDatasets

## ---- eval = LOCAL------------------------------------------------------------
head(coef_mean, 40)
barplot(coef_mean)
abline(h=(minB+maxB)/2,lwd=2,lty=2,col="blue")

## ---- eval = LOCAL------------------------------------------------------------
head(coef_lasso_mean, 40)
barplot(coef_lasso_mean,ylim=c(0,1.5))
abline(h=(minB+maxB)/2,lwd=2,lty=2,col="blue")
Any scripts or data that you put into this service are public.
SelectBoost documentation built on Dec. 1, 2022, 1:27 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
SelectBoost
A General Algorithm to Enhance the Performance of Variable Selection Methods in Correlated Datasets

inst/doc/sim-with-sb.R
In SelectBoost: A General Algorithm to Enhance the Performance of Variable Selection Methods in Correlated Datasets

Try the SelectBoost package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SelectBoost A General Algorithm to Enhance the Performance of Variable Selection Methods in Correlated Datasets

inst/doc/sim-with-sb.R In SelectBoost: A General Algorithm to Enhance the Performance of Variable Selection Methods in Correlated Datasets

Try the SelectBoost package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SelectBoost
A General Algorithm to Enhance the Performance of Variable Selection Methods in Correlated Datasets

inst/doc/sim-with-sb.R
In SelectBoost: A General Algorithm to Enhance the Performance of Variable Selection Methods in Correlated Datasets