#' Create molecular descriptors and MoA features interaction graph
#'
#' This function takes in input one row of the metrics matrix returned by the tqsar and hqsar functions and plot the interaction graph of genes and MDs
#' @param model_info is one row of the metrics matrix
#' @param MDMat is the matrix of modelucalr descriptors
#' @param GeneMat is the matrix of MOA features
#' @param neg_th is a threshold that the user can set up to filter the graph from negative endges. If not null, the plot will contain only the edges with value lower that the neg_th quantile of the distribution of the negative edges weights
#' @param pos_th is a threshold that the user can set up to filter the graph from negative endges. If not null, the plot will contain only the edges with value higher that the pos_th quantile of the distribution of the positive edges weights
#' @keywords plot interaction graph
#' @export
#' @examples
#'
plot_MD_GE_graph = function(model_info, MDMat, GeneMat, neg_th = NULL, pos_th = NULL){
# model_info = abs.alpha.power.res.filtered.metrics["29",]
# MDMat = rbind(X_md[train.idx,],X_md[test.idx,])
# GeneMat = rbind(X_ge[train.idx,],X_ge[test.idx,])
coeff = round(as.numeric(unlist(strsplit(as.character(model_info[1,"coefficients"]),split=";"))),4)
MDs = unlist(strsplit(as.character(model_info[1,"MDlist"]),split = ";"))
GEs = unlist(strsplit(as.character(model_info[1,"GEList"]),split = ";"))
nodes = c(MDs, GEs)
M = matrix(0, nrow = length(nodes), ncol = length(nodes))
colnames(M) = rownames(M) = nodes
for(i in nodes){
if(i %in% colnames(MDMat)){
iv = MDMat[,i]
}else{
iv = GeneMat[,i]
}
for(j in nodes){
if(j %in% colnames(MDMat)){
jv = MDMat[,j]
}else{
jv = GeneMat[,j]
}
M[i,j] = M[j,i] = cor(iv, jv)
}
}
diag(M) = 0
if(!is.null(neg_th) & is.null(pos_th)){
neg_q = quantile(M[M<0], neg_th)
M[ M > neg_q & M < 0] = 0
}
if(!is.null(pos_th) & is.null(neg_th)){
pos_q = quantile(M[M>=0], pos_th)
M[ M < pos_q & M > 0] = 0
}
if(!is.null(pos_th) & !is.null(neg_th)){
neg_q = quantile(M[M<0], neg_th)
pos_q = quantile(M[M>=0], pos_th)
M[ M > neg_q & M< pos_q] = 0
}
g = igraph::graph.adjacency(M, weighted = T, mode = "undirected")
vcol = rep( adjustcolor( "red", alpha.f = 0.1),nrow(M)) #rep("pink",6)
names(vcol) = rownames(M)
vcol[MDs] = adjustcolor( "blue", alpha.f = 0.1)
v.lab.col = rep( "darkred",nrow(M)) #rep("pink",6)
names(v.lab.col) = rownames(M)
v.lab.col[MDs] = "darkblue"
ecol = rep("gold",length(igraph::E(g)$weight))
ecol[sign((igraph::E(g)$weight))<0] = "darkgreen"
par(mar = c(0,2,2,0))
igraph::plot.igraph(g, vertex.color = vcol, vertex.label.cex = 0.8, vertex.frame.color = vcol,
edge.width = abs(igraph::E(g)$weight * 2), vertex.label.color =v.lab.col,
edge.color = ecol,
layout = igraph::layout.star)
legend("topleft",ncol = 2, bty = "n",cex = 0.8,
legend = c("Pos. Beta","Neg. Beta","Pos. Cor","Neg. Cor", "MDs","Genes"),
fill = c("lightblue","pink","gold","darkgreen", "darkblue","darkred"))
}
#' Save prediction results in a predefined folder
#'
#' This function takes in input the results of the analysis and plot the williams plot, the scatterplot of experimental versus predicted values as a pdf and store the predicted values in a csv file
#' @param res is an object of hqsar class (resulting from the hqsar function)
#' @param type.transformation is a string indicating the type of transformation to be performed at the data before fitting the model.
#' @param xlab is the text for the x-axis
#' @param ylab is the text for the y-axis
#' @param start_path is the path of the folder where to store the files
#' @keywords save results
#' @export
#' @examples
#'
build_predicted_results = function(res, type.transformation, xlab = "Experimental HSB", ylab="Predicted HSB", start_path = "/home/MOKA/Angela/angela/hQSAR_test/res/predictions/"){
filtMet = filter_metric_models(Info = res$Metrics)
write.table(res$Metrics,file = paste(start_path,type.transformation,"_unfiltered_metrics.csv",sep=""), quote = FALSE, sep="\t", row.names = TRUE, col.names = TRUE)
write.table(filtMet,file = paste(start_path,type.transformation,"_filtered_metrics.csv",sep=""), quote = FALSE, sep="\t", row.names = TRUE, col.names = TRUE)
nres = length(res$pred_tr)
for(i in 1:nres){
pdf(file = paste(start_path,type.transformation,"_WP_",i,".pdf",sep=""))
plot_wp(res$williams_plots[[i]]$DTP)
dev.off()
pdf(file = paste(start_path,type.transformation,"_experimental_predicted_",i,".pdf",sep=""))
hyQSAR:::plot_pred_real(pred_train=res$pred_tr[[i]],
pred_test=res$pred_te[[i]],
train_class = res$y_train, test_class=res$y_test, xlab, ylab)
dev.off()
PredVal = cbind(rbind(cbind(res$pred_tr[[i]], res$y_train, abs(res$pred_tr[[i]] - res$y_train)),
cbind(res$pred_te[[i]], res$y_test, abs(res$pred_te[[i]] - res$y_test))),
c(rep("Train", length(res$y_train)),rep("Test", length(res$y_test))))
colnames(PredVal) = c("Predicted","Experimental","Resuidusl","Train/Test")
write.table(PredVal,file = paste(start_path,type.transformation,"_exp_vs_pred_",i,".csv",sep=""), quote = FALSE, sep="\t")
}
return(filtMet)
}
#' Transform data matrix function
#'
#' This function transform data based on the type of transfromation
#' @param X is the matrix of the dataset with samples on the rows and features on the column
#' @param type.transformation is a string indicating the type of transformation to be performed at the data before fitting the model.
#' It can be one of the following: "none","abs","abs.alpha.power","mul","log.abs","log.abs.alpha.power"; none is the default parameter.
#' @param power is a numeric power to be used in the abs.alpha.power transformation. Default is 1.
#' @keywords data transformation
#' @export
#' @examples
#'
transform_data = function(X,type.transformation="abs.alpha.power", power=1, cost=3, bb = 2){
if(type.transformation=="none") return(X)
if(type.transformation=="abs") return(abs(X))
if(type.transformation=="mul") return(X * cost)
if(type.transformation=="log.abs") return(log(abs(X + 1e-300),base = bb))
if(type.transformation=="abs.alpha.power") return(abs(X)^power)
if(type.transformation=="log.abs.alpha.power") return(log(abs(X)^power + 1e-300, base=bb))
}
#' Evaluating QSAR regression model
#'
#' This function compute the internal and external evaluation metrics for a QSAR model
#' @import glmnet
#' @param bm is a model obtained from the RCVLassoPar function
#' @param X_train is the matrix of the training dataset with samples on the rows and features on the column. The X_train matrix has to be already transformed.
#' @param X_test is the matrix of the test dataset with samples on the rows and features on the column. The X_train matrix has to be already transformed.
#' @param y_train is the numeric vector of response variable for training samples
#' @param y_test is the numeric vector of response variable for test samples
#' @param inner.train.prop is the percentage of samples from the train test to be used as training set in the random-split method. Default value is 0.9
#' @param p_train is the percentage of samples to be used in the training set
#' @param nPermValm is the number of random split iteration to compute the validation metrics
#' @param nPermScr is the number of random split iteration to compute the y-scrambling test
#' @return a list with the following components:
#' \item{Metrics}{a dataframe with the internal and external metrics computed for the identified models}
#' \item{WP}{a list with percentage of applicability domain of training and test plot and an object to be plot with the williams plot function }
#' @keywords model evaluation
#' @export
#'
evaluating_model = function(bm, X_train, X_test, y_train, y_test, md_list, gene_list, inner.train.prop,nPermValm=25, nPermScr=25){
features = colnames(X_train)
mse_cv = bm$pred.mse
Perm_res = validation_metrics(y = y_train,X = X_train,train.prop=inner.train.prop,lambda = bm$opt.lambda,nPerm = nPermValm)
Y_scramble = random_split_Y_scramble(y = y_train,X = X_train,train.prop=inner.train.prop,lambda = bm$opt.lambda,nPerm = nPermScr)
finalModel = glmnet(x = X_train,y = y_train,lambda = bm$opt.lambda,intercept = TRUE)
y_pred_test = predict(finalModel,X_test)
y_pred_train = predict(finalModel,X_train)
print("Compute Tropsha Metrics")
c(trm1, k, trm2, k1, trm3) %<-% tropsha_metrics(predicted = y_pred_test,observed = y_test)
print("Compute Ojha Metrics")
c(rm2,dr2m) %<-% ojha_validation_metrics(X_train = X_train,X_test = X_test,y_train = y_train,y_test = y_test,lambda = finalModel$opt.lambda)
print("computed tropsha and oja metrics ")
beta = finalModel$beta[,1]
beta = beta[beta!=0]
if(length(beta)==0){
Metrics = data.frame(nMD = 0, nGE = 0, MSE_tr = 0, MSE_te=0,R2_tr = 0,R2_te = 0,
Q2 = 0,CCC = 0,Q2F1 = 0,Q2F2 = 0, Q2F3 = 0, rm2 = 0,dr2m = 0,trm1=0, k=0,trm2=0,k1=0,trm3=0,
R2Yscr= 0,Q2Yscr= 0,AD_train= 0,AD_test = 0, MDlist = "", GElist = "",Intercept = "", coefficients = "")
print("no rel beta!!!!")
return(list(Metrics=Metrics, WP=0,y_pred_test = y_pred_test, y_pred_train=y_pred_train))
}
mdidx = which(names(beta) %in% md_list)
geidx = which(names(beta) %in% gene_list)
MDs = names(beta)[mdidx]
nMD = length(MDs)
MDList = paste(MDs,collapse = ";")
GEs = names(beta)[geidx]
nGE = length(GEs)
GEList = paste(GEs,collapse = ";")
coeffList =paste(c(beta[mdidx],beta[geidx]),collapse = ";")
y_pred_test = predict(finalModel,X_test)
y_pred_train = predict(finalModel,X_train)
predictive_mse = sum( { y_test - y_pred_test }^2 )/length(y_pred_test)
print("computed predictive_mse")
R2_test = R2_func(observed = y_test, predicted = predict(finalModel,X_test))
R2_train = R2_func(observed = y_train, predicted = predict(finalModel,X_train))
WP = williams_plot(X_train = X_train,X_test = X_test, y_train = y_train,y_test = y_test, model = finalModel,beta = beta)
AD = WP$ADVal
print("computed WP")
names(AD) = c("AD_train","AD_test")
names(Perm_res) = c("Q2","CCC","Q2F1","Q2F2","Q2F3","rm2")
names(Y_scramble) = c("R2Yscr","Q2Yscr")
Metrics = data.frame(nMD = nMD, nGE = nGE,MSE_tr = mse_cv, MSE_te=predictive_mse,R2_tr = R2_train,R2_te = R2_test,
Q2 = Perm_res[1],CCC = Perm_res[2],Q2F1 = Perm_res[3],Q2F2 = Perm_res[4], Q2F3 = Perm_res[5], #rm2 = Perm_res[6],
rm2 = rm2,dr2m = dr2m,trm1=trm1, k=k,trm2=trm2,k1=k1,trm3=trm3,
R2Yscr= Y_scramble[1],Q2Yscr= Y_scramble[2],AD_train= AD[1],AD_test = AD[2], MDlist = as.character(MDList),GEList = as.character(GEList), Intercept = finalModel$a0, coefficients = coeffList)
rownames(Metrics) = NULL
toRet =list(Metrics=Metrics, WP=WP,y_pred_test = y_pred_test, y_pred_train=y_pred_train)
print(length(toRet))
return(list(Metrics=Metrics, WP=WP,y_pred_test = y_pred_test, y_pred_train=y_pred_train))
}
#' Filter models
#'
#' This function check if the identified models satisy the QSAR external and internal validation criteria
#' @param Info is a dataframe containing QSAR internal and external validation metric for each model
#' @return a filtered dataframe containing only the QSAR model that pass the filters. It returns -1 if no model satisfy the criteria.
#' @keywords model filtering
#' @export
#' @examples
#'
filter_metric_models = function(Info){
Info$Q2 = as.numeric(as.vector(Info$Q2))
Info$R2_tr = as.numeric(as.vector(Info$R2_tr))
Info$R2_te = as.numeric(as.vector(Info$R2_te))
Info$Q2F1 = as.numeric(as.vector(Info$Q2F1))
Info$Q2F2 = as.numeric(as.vector(Info$Q2F2))
Info$Q2F3 = as.numeric(as.vector(Info$Q2F3))
Info$MSE_tr = as.numeric(as.vector(Info$MSE_tr))
Info$MSE_te = as.numeric(as.vector(Info$MSE_te))
Info$CCC = as.numeric(as.vector(Info$CCC))
Info$rm2 = as.numeric(as.vector(Info$rm2))
Info$R2Yscr = as.numeric(as.vector(Info$R2Yscr))
Info$Q2Yscr = as.numeric(as.vector(Info$Q2Yscr))
idx1 = which(Info$R2_tr>0.6)
idx2 = which(Info$R2_te>0.6)
idx3 = which(Info$Q2>0.5)
idx4 = which(Info$Q2F1>0.6)
idx5 = which(Info$Q2F2>0.6)
idx6 = which(Info$Q2F3>0.6)
idx7 = which(Info$CCC>0.85)
idx8 = which(Info$rm2>0.5)
idx9 = which(Info$AD_test == 100)
idx = intersect(idx1,intersect(idx2,intersect(idx3, intersect(idx4, intersect(idx5, intersect(idx6,intersect(idx7,intersect(idx8,idx9))))))))
if(length(idx)==0){
print("No solution satisfy the metric criterias!")
return(-1)
}
if(length(idx)==1){
Info2= Info[idx,]
return(Info2)
} else{
Info2 = Info[idx,]
Info2 = Info2[order(Info2$R2_te,decreasing = T),]
return(Info2)
}
}
#' Function to create train/test split
#'
#' This function create a training-test split of the dataset
#' @param y is the reponse vector of the training set
#' @param X is the matrix of the dataset with samples on the rows and features on the column
#' @param p_train is the percentage of samples to be used in the training set
#' @return a list with the following components:
#' \item{X_train}{a matrix with the training samples}
#' \item{X_test}{a matrix with the test samples}
#' \item{y_train}{the numeric vector of responses for the training samples}
#' \item{y_test}{the numeric vector of responses for the test samples}
#' @keywords training/test split
#' @export
#' @examples
#'
training_test_split = function(X,y,p_train){
train.index <- createDataPartition(y, p = p_train, list = FALSE)
X_train = X[train.index,]
X_test = X[-train.index,]
y_train = y[train.index]
y_test = y[-train.index]
test.index = 1:nrow(X)
test.index = test.index[-train.index]
return(list(X_train=X_train,X_test=X_test,y_train=y_train,y_test=y_test,train.index=train.index,test.index=test.index))
}
training_test_split_2 = function(X,y,p_train){
quantile(y)
idx = which(y>= quantile(y)[2] & y<=quantile(y)[4])
nSamples = nrow(X) * (1-p_train)
test.idx = sample(x = idx,size = round(nSamples))
X_train = X[-test.idx,]
X_test = X[test.idx,]
y_train = y[-test.idx]
y_test = y[test.idx]
return(list(X_train=X_train,X_test=X_test,y_train=y_train,y_test=y_test,test.idx=test.idx))
}
#' Function to compute the lambda grid values
#'
#' This function computes a grid of lambda values for the LASSO analysis
#' @param y is the reponse vector of the training set
#' @param X is the matrix of the dataset with samples on the rows and features on the column
#' @param nlambda is the nuber of lambda to generatea
#' @return a numeric vector of length nlabmda
#' @keywords internal
#' @export
#' @examples
#'
lambda.grid <- function(y, x, nlambda = 100){
n <- length(y)
J <- ncol(x)
## Generate the initial sequence
sx <- scale(x)
sx[is.na(sx)] = 0
sy <- as.vector(scale(y))
u <- rep(0, times = J)
for (j in 1:J){
u[j] <- abs( sum(sy * sx[,j]) )
}
lmax <- max(u) / n
l <- seq(log(lmax/1000), log(lmax), length = nlambda )
l <- exp(l)
return(l)
}
#' Function to compute the mean
#'
#' This function computes the mean of each column of a matrix by taking into account only non zero values
#' @param X is the matrix of the dataset with samples on the rows and features on the column
#' @return a numeric vector of column means
#' @keywords internal
#' @export
#' @examples
#'
compute_mean = function(X){
avg = apply(X, 2, FUN <- function(variables) {
idx = which(variables!=0)
if(length(idx)==0){
return(0)
}else{
return(mean(variables[idx],na.rm=TRUE))
}
})
return(avg)
}
#' Function to plot the results of the random split cross validation
#'
#' This function plot an object of class RCVLasso
#' @importFrom graphics abline arrows axis legend lines plot points
#' @param obj is the object to be plotted
#' @keywords plot
#' @export
#' @examples
#'
plot.RCVLasso <- function(obj){
n.splits <- nrow(obj$cv$mse)
lambda <- obj$lambda
if(obj$measure == 'R2'){
avg <- apply(obj$cv$R2, 2, mean, na.rm = TRUE)
se <- apply(obj$cv$R2, 2, sd , na.rm = TRUE) / sqrt(n.splits)
uci <- avg + 1.96 * se
lci <- avg - 1.96 * se
idx_star <- which.max(avg)
idx_star_u <- max(which(lambda >= lambda[idx_star] & { avg <= uci[idx_star] & avg >= lci[idx_star] }))
idx_star_l <- min(which(lambda <= lambda[idx_star] & { avg <= uci[idx_star] & avg >= lci[idx_star] }))
}else{
avg <- apply(obj$cv$mse, 2, mean, na.rm = TRUE)
se <- apply(obj$cv$mse, 2, sd , na.rm = TRUE) / sqrt(n.splits)
uci <- avg + 1.96 * se
lci <- avg - 1.96 * se
idx_star <- which.min(avg)
idx_star_l <- max(which(lambda >= lambda[idx_star] & { avg <= uci[idx_star] & avg >= lci[idx_star] }))
idx_star_u <- min(which(lambda <= lambda[idx_star] & { avg <= uci[idx_star] & avg >= lci[idx_star] }))
}
plot(log(lambda) , avg, t='n', ylim=range(uci, lci),
ylab=paste('Predictive ',obj$measure,sep=''), xlab='log(lambda)')
arrows(log(lambda), lci, log(lambda), uci,
length=0.075, angle=90, code=3, col='grey', lwd=1.2)
lines(log(lambda), avg, t='p', pch=20, col='red', cex=1)
abline(v = log(lambda)[idx_star], lty = 2, col = 3)
abline(v = log(lambda)[idx_star_u], lty = 2, col = 3)
abline(v = log(lambda)[idx_star_l], lty = 2, col = 3)
axis(side=3, at=log(lambda[c(idx_star, idx_star_u,idx_star_l)]),
labels = round(apply(obj$cv$active.size, 2, mean, na.rm = TRUE)[c(idx_star, idx_star_u,idx_star_l)]))
}
#' Function to compute the active size for each lambda
#'
#' Given the results of the random splits method this function computes the active size for each lambda
#' @param obj an object of class RCVLasso
#' @return a vector of active sizes of the same length of the number of lambda used to perform the random split method
#' @keywords internal
#' @export
#' @examples
#'
compute_active_sizes = function(obj){
n.splits <- nrow(obj$cv$mse)
lambda <- obj$lambda
if(obj$measure == 'R2'){
avg <- apply(obj$cv$R2, 2, mean, na.rm = TRUE)
se <- apply(obj$cv$R2, 2, sd , na.rm = TRUE) / sqrt(n.splits)
uci <- avg + 1.96 * se
lci <- avg - 1.96 * se
idx_star <- which.max(avg)
idx_star_95 <- max(which(lambda >= lambda[idx_star] & { avg <= uci[idx_star] & avg >= lci[idx_star] }))
}else{
avg <- apply(obj$cv$mse, 2, mean, na.rm = TRUE)
se <- apply(obj$cv$mse, 2, sd , na.rm = TRUE) / sqrt(n.splits)
uci <- avg + 1.96 * se
lci <- avg - 1.96 * se
idx_star <- which.min(avg)
idx_star_95 <- max(which(lambda >= lambda[idx_star] & { avg <= uci[idx_star] & avg >= lci[idx_star] }))
}
xx = round(apply(obj$cv$active.size, 2, mean, na.rm = TRUE)[c(idx_star, idx_star_95)])
return(xx)
}
#' Function to combine the results of the parallel random split method
#'
#' Given the results of the random splits method this function computes the active size for each lambda
#' @param x is the output iniziatilization, if present in the .init parameter of foreach
#' @param ... values returned by the foreach function
#' @return a list of combined objects
#' @keywords internal
#' @export
#' @examples
#'
comb = function(x, ...) {
# create a list with all the first results of the foreach iteration
y1 = lapply(list(...), function(y) y[[1]])
# create a matrix from this list
x1 = matrix(unlist(y1),byrow = T,ncol = length(y1[[1]]))
y2 = lapply(list(...), function(y) y[[2]])
x2 = matrix(unlist(y2),byrow = T,ncol = length(y2[[1]]))
y3 = lapply(list(...), function(y) y[[3]])
#x3 = do.call(rbind, y3)
x3 = matrix(unlist(y3),byrow = T,ncol = length(y3[[1]]))
y4 = lapply(list(...), function(y) y[[4]])
list(x1,x2,x3,y4[[length(y4)]])
}
#' Function to perform the random split analysis
#'
#' This function perform the random split analysis method to estimate the optimal lambda shrinkage parameter for the LASSO model
#' @import glmnet
#' @import doParallel
#' @import foreach
#' @param x is the matrix of the input dataset with samples on the rows and features on the columns
#' @param y is the vector of response variables with same length of the number of samples
#' @param lambda is a numeric vector of lambda value to be used in the LASSO parameter selection step
#' @param n.splits is the number of random split to be performed. Default value is 25
#' @param train.prop is the percentage of samples in the dataset to be used as training set. Default value is 0.9
#' @param measure is the measure used to perform the choice of the optimal lambda value. Possible values are MSE and R2. Default value is MSE
#' @param intercept is a boolean valus indicating if we want to fit or not the intercept. Default valuw is TRUE
#' @param type.solution is a string indicating the type of solution to compute. Possible values are: min, uci and lci; if standard min or max of the average CV function.
#' if lci, take the the most parsimonous solution within the tot percentage of confidence bands around the standard solution. if uci a less partimosious solution within the tot percentage of confidence bands around the standard solution is selected
#' @param nCores is the number of cores to be used
#' @param th is the size of the confidence interval. Default value is 1.96
#' @return an object of class RCVLasso containing the following objects:
#' \item{cv}{list with matrices (of sizes n.slipts x n.lambda) containing statistics for each lambda and each splitting.
#' mse is the matrix of predictive mse. R2 is the matrix of predictive R2 and active.size is the matrix with active beta in the trained model for each lambda and for each split.}
#' \item{lambda}{numeric vector of the lambda values taken in input}
#' \item{fitted}{predicted values}
#' \item{residuals}{differences between predicted and real values}
#' \item{intercept}{if the input parameter intercept is TRUE, it is the numeric value of the fitted intercep, otherwise is zero}
#' \item{beta}{beta coefficients of the fitted model}
#' \item{opt.lambda}{optimal lambda value}
#' \item{idx.support}{index of the optimal lambda value}
#' \item{pred.R2}{R2 of the optimal model on the test sets}
#' \item{pred.mse}{mse of the optimal model on the test sets}
#' \item{mse}{mse of the optimal model on the training sets}
#' \item{R2}{R2 of the optimal model on the training sets}
#' \item{measure}{measure used to compute the optimal solution}
#' \item{fitted_model}{model fitted on the whole data}
#' @keywords random split lasso
#' @export
RCVLassoPar = function(y , x , lambda , n.splits = 25 , train.prop = 0.9,measure = 'MSE',
intercept = TRUE , type.solution = 'lci',nCores = 20,th = 1.96){
cl = makeCluster(nCores)
registerDoParallel(cl)
n <- length(y)
ntrain <- ceiling(train.prop * n)
ntest <- n-ntrain
nlambda <- length(lambda)
print("starting foreach...")
oper <- foreach(s=1:n.splits,.combine=comb,.multicombine=TRUE,
.init=list(c(), c(),c())) %dopar% {
CVmse <- CVR2 <- CVnpreds <- rep(0, nlambda)
## Sample training set
idx_train <- sample(1:n, size = ntrain, replace = FALSE)
## Cycle over tuning parameters
for (l in 1:nlambda){
## dummy initial containers
fit <- NULL
yhat <- rep(NA, times = ntest)
RSS <- TSS <- 0
## Try to perform the LASSO
try({
fit <- glmnet(y = y[idx_train], x = x[idx_train, ],
lambda = lambda[l] , intercept = intercept)
fit$beta <- as.numeric(fit$beta)
}, silent = TRUE)
if(!is.null(fit)){
## Select relavent predictors
idx_relevant <- which(fit$beta !=0)
nRel <- length(idx_relevant)
CVnpreds[l] = nRel
if( nRel >= 2){
if (intercept){
if(fit$a0!=0){
yhat <- cbind(1,x[-idx_train, ]) %*% c(fit$a0, fit$beta)
}
}else{
yhat <- x[-idx_train, ] %*% fit$beta
}
yhat <- as.numeric(yhat)
RSS <- sum( { y[-idx_train] - yhat }^2 )
TSS <- sum( { y[-idx_train] - mean(y[-idx_train]) }^2 )
CVmse[l] = RSS / ntest
r2 = 1 - RSS / TSS
if(r2<0){
CVR2[l] = 0
}else{
CVR2[l] = r2
}
}else{
## If none of the regressors is selected we consider the constant model
## y = mean(y) + error
RSS <- sum( { y[-idx_train] - mean(y[-idx_train]) }^2 )
CVmse[l] = RSS / ntest
CVR2[l] = 0
}
} ## END: if(!is.null(fit))
} ## ENDS for for (l in 1:nlambda)
toRet =list(CVmse=CVmse,CVR2=CVR2,CVnpreds=CVnpreds,idx_train = idx_train)
return(toRet)
}## ENDS foreach (s in 1:n.splits)
print("end foreach...")
stopCluster(cl)
CVmse = oper[[1]]
CVR2 = oper[[2]]
CVnpreds = oper[[3]]
idx_train = oper[[4]]
print("check measure...")
## Select the final lambda
if(measure=='R2'){
avg <- apply(CVR2, 2, mean, na.rm=TRUE)
se <- apply(CVR2, 2, sd, na.rm=TRUE) / sqrt(n.splits)
uci <- avg + th * se
lci <- avg - th * se
if(type.solution=='min'){
# print('min')
idx_best <- which.max(avg)
}else{
# print('ci')
idx_star <- which.max(avg)
idx_best <- max(which(lambda >= lambda[idx_star] & { avg <= uci[idx_star] & avg >= lci[idx_star] }))
}
}
if(measure=='MSE'){
# print('mse')
avg <- apply(CVmse, 2, mean, na.rm=TRUE)
se <- apply(CVmse, 2, sd, na.rm=TRUE) / sqrt(n.splits)
uci <- avg + th * se
lci <- avg - th * se
if(type.solution=='min'){
# print('min')
idx_best <- which.min(avg)
}
if(type.solution=="uci"){ #more features
# print('ci')
idx_star <- which.min(avg)
idx_best <- min(which(lambda <= lambda[idx_star] & { avg <= uci[idx_star] & avg >= lci[idx_star] }))
}
if(type.solution=="lci"){ #less features
idx_star <- which.min(avg)
idx_best <- max(which(lambda >= lambda[idx_star] & { avg <= uci[idx_star] & avg >= lci[idx_star] }))
}
}
## Fit the optimal lasso on the entire sample
##
print("fit optimal lasso...")
lstar <- lambda[idx_best]
print(paste("LStar:", lstar))
fit <- glmnet(y = y, x = x, lambda = lstar , intercept = intercept)
fit$beta <- as.numeric(fit$beta)
idx_relevant <- which(fit$beta !=0)
# print(dim(CVnpreds))
# print(dim(CVmse))
# print(dim(CVmse))
#
#
# print(n.splits)
# print(nlambda)
print("assigning idx relevant")
CVnpreds[nrow(CVnpreds),nlambda] <- length(idx_relevant)
print("assigned idx relevant")
if( length(idx_relevant) >= 1){
if (intercept){
if(fit$a0!=0){
yhat <- cbind(1,x) %*% c(fit$a0, fit$beta)
}
}else{
yhat <- x %*% fit$beta
}
yhat <- as.numeric(yhat)
RSS <- sum( { y - yhat }^2 )
TSS <- sum( { y - mean(y) }^2 )
R2 <- 1 - (RSS / TSS)
}else{
## If none of the regressors is selected we consider the constant model
## y = mean(y) + error
yhat <- rep(mean(y), n)
RSS <- sum( { y[-idx_train] - mean(y[-idx_train]) }^2 )
R2 <- 0
}
## Output
ans <- list()
ans$cv <- list(mse = CVmse, R2 = CVR2, active.size = CVnpreds)
ans$lambda <- lambda
ans$fitted <- yhat
ans$residuals <- as.numeric(y-yhat)
if(intercept){
ans$intercept <- fit$a0
}else{
ans$intercept <- 0
}
ans$beta <- fit$beta
ans$opt.lambda <- lstar
ans$idx.support <- idx_relevant
#ans$pred.R2 <- avg[idx_best]
ans$pred.R2 <- apply(CVR2, 2, mean, na.rm=TRUE)[idx_best]
ans$pred.mse <- apply(CVmse, 2, mean, na.rm=TRUE)[idx_best]
ans$mse <- RSS/n
ans$R2 <- R2
ans$measure = measure
ans$fitted_model = fit
class(ans) <- 'RCVLasso'
return(ans)
}
plot_pred_real = function(pred_train, pred_test, train_class, test_class, xlab, ylab){
min_x = min(pred_train,pred_test)
max_x = max(pred_train,pred_test)
min_y = min(train_class,test_class)
max_y = max(train_class,test_class)
train_tp = cbind(pred_train, train_class)
plot(pred_train,train_class, xlim = c(min_y,max_y),ylim = c(min_y,max_y), xlab = xlab, ylab = ylab)
points(pred_test,test_class,col="red")
lines(x = c(min_y,max_y), y = c(min_y,max_y))
#text(x = 1,y=-1,labels = paste("MSE: ",round(sum( { train_class - train_tp[,1] }^2 )/nrow(train_tp),2),sep=""))
legend(x = "bottomright",legend = c("Train","Test"),fill = c("black","red"),bty = "n")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.