R/model_functions.R

#' Create molecular descriptors and MoA features interaction graph
#'
#' This function takes as input one row of the metrics matrix returned by the tqsar and hqsar functions and plots the interaction graph of genes and MDs
#' @param model_info is one row of the metrics matrix
#' @param MDMat is the matrix of molecular descriptors
#' @param GeneMat is the matrix of MoA features
#' @param neg_th is a threshold that the user can set to filter negative edges from the graph. If not NULL, the plot will contain only the negative edges with value lower than the neg_th quantile of the distribution of the negative edge weights
#' @param pos_th is a threshold that the user can set to filter positive edges from the graph. If not NULL, the plot will contain only the positive edges with value higher than the pos_th quantile of the distribution of the positive edge weights
#' @keywords plot interaction graph
#' @export
#' @examples
#'
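#' # Illustrative sketch on small synthetic inputs (all object and feature names are hypothetical)
#' \dontrun{
#' set.seed(1)
#' MDMat   <- matrix(rnorm(30 * 2), ncol = 2, dimnames = list(NULL, c("MD1", "MD2")))
#' GeneMat <- matrix(rnorm(30 * 2), ncol = 2, dimnames = list(NULL, c("GENE1", "GENE2")))
#' model_info <- data.frame(coefficients = "0.8;-0.4;0.2;0.1",
#'                          MDlist = "MD1;MD2", GEList = "GENE1;GENE2",
#'                          stringsAsFactors = FALSE)
#' plot_MD_GE_graph(model_info, MDMat, GeneMat, pos_th = 0.5)
#' }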
plot_MD_GE_graph = function(model_info, MDMat, GeneMat, neg_th = NULL, pos_th = NULL){
  # model_info = abs.alpha.power.res.filtered.metrics["29",]
  # MDMat = rbind(X_md[train.idx,],X_md[test.idx,])
  # GeneMat = rbind(X_ge[train.idx,],X_ge[test.idx,])

  coeff = round(as.numeric(unlist(strsplit(as.character(model_info[1,"coefficients"]),split=";"))),4)
  MDs = unlist(strsplit(as.character(model_info[1,"MDlist"]),split = ";"))
  GEs = unlist(strsplit(as.character(model_info[1,"GEList"]),split = ";"))

  nodes = c(MDs, GEs)
  M = matrix(0, nrow = length(nodes), ncol = length(nodes))
  colnames(M) = rownames(M) = nodes

  for(i in nodes){
    if(i %in% colnames(MDMat)){
      iv = MDMat[,i]
    }else{
      iv = GeneMat[,i]
    }
    for(j in nodes){
      if(j %in% colnames(MDMat)){
        jv = MDMat[,j]
      }else{
        jv = GeneMat[,j]
      }

      M[i,j] = M[j,i] = cor(iv, jv)

    }
  }

  diag(M) = 0

  if(!is.null(neg_th) & is.null(pos_th)){
    neg_q = quantile(M[M<0], neg_th)
    M[ M > neg_q & M < 0] = 0
  }
  if(!is.null(pos_th) & is.null(neg_th)){
    pos_q = quantile(M[M>=0], pos_th)
    M[ M < pos_q & M > 0] = 0
  }
  if(!is.null(pos_th) & !is.null(neg_th)){
    neg_q = quantile(M[M<0], neg_th)
    pos_q = quantile(M[M>=0], pos_th)
    M[ M > neg_q & M< pos_q] = 0
  }

  g = igraph::graph.adjacency(M, weighted = T, mode = "undirected")

  vcol = rep( adjustcolor( "red", alpha.f = 0.1),nrow(M)) #rep("pink",6)
  names(vcol) = rownames(M)
  vcol[MDs] = adjustcolor( "blue", alpha.f = 0.1)

  v.lab.col = rep( "darkred",nrow(M)) #rep("pink",6)
  names(v.lab.col) = rownames(M)
  v.lab.col[MDs] = "darkblue"

  ecol = rep("gold",length(igraph::E(g)$weight))
  ecol[sign((igraph::E(g)$weight))<0] = "darkgreen"

  par(mar = c(0,2,2,0))
  igraph::plot.igraph(g, vertex.color = vcol, vertex.label.cex = 0.8, vertex.frame.color = vcol,
       edge.width = abs(igraph::E(g)$weight * 2), vertex.label.color =v.lab.col,
       edge.color = ecol,
       layout = igraph::layout.star)
  legend("topleft",ncol = 2,  bty = "n",cex = 0.8,
         legend = c("Pos. Beta","Neg. Beta","Pos. Cor","Neg. Cor", "MDs","Genes"),
         fill = c("lightblue","pink","gold","darkgreen", "darkblue","darkred"))


}

#' Save prediction results in a predefined folder
#'
#' This function takes as input the results of the analysis, plots the Williams plot and the scatterplot of experimental versus predicted values as PDF files, and stores the predicted values in CSV files
#' @param res is an object of hqsar class (resulting from the hqsar function)
#' @param type.transformation is a string indicating the type of transformation that was performed on the data before fitting the model.
#' @param xlab is the text for the x-axis
#' @param ylab is the text for the y-axis
#' @param start_path is the path of the folder where the files are stored
#' @keywords save results
#' @export
#' @examples
#'
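#' # Hedged sketch: assumes `res` is the result object returned by the hqsar function
#' \dontrun{
#' out_dir <- file.path(tempdir(), "predictions/")
#' dir.create(out_dir, showWarnings = FALSE)
#' filtMet <- build_predicted_results(res, type.transformation = "abs.alpha.power",
#'                                    start_path = out_dir)
#' }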
build_predicted_results = function(res, type.transformation, xlab = "Experimental HSB", ylab="Predicted HSB", start_path = "/home/MOKA/Angela/angela/hQSAR_test/res/predictions/"){

  filtMet = filter_metric_models(Info = res$Metrics)
  write.table(res$Metrics,file = paste(start_path,type.transformation,"_unfiltered_metrics.csv",sep=""), quote = FALSE, sep="\t", row.names = TRUE, col.names = TRUE)
  write.table(filtMet,file = paste(start_path,type.transformation,"_filtered_metrics.csv",sep=""), quote = FALSE, sep="\t", row.names = TRUE, col.names = TRUE)


  nres = length(res$pred_tr)
  for(i in 1:nres){
    pdf(file = paste(start_path,type.transformation,"_WP_",i,".pdf",sep=""))
    plot_wp(res$williams_plots[[i]]$DTP)
    dev.off()


    pdf(file = paste(start_path,type.transformation,"_experimental_predicted_",i,".pdf",sep=""))
    hyQSAR:::plot_pred_real(pred_train=res$pred_tr[[i]],
                   pred_test=res$pred_te[[i]],
                   train_class = res$y_train, test_class=res$y_test, xlab, ylab)
    dev.off()

    PredVal = cbind(rbind(cbind(res$pred_tr[[i]], res$y_train, abs(res$pred_tr[[i]] - res$y_train)),
                          cbind(res$pred_te[[i]], res$y_test, abs(res$pred_te[[i]] - res$y_test))),
                    c(rep("Train", length(res$y_train)),rep("Test", length(res$y_test))))

    colnames(PredVal) = c("Predicted","Experimental","Residual","Train/Test")

    write.table(PredVal,file = paste(start_path,type.transformation,"_exp_vs_pred_",i,".csv",sep=""), quote = FALSE, sep="\t")

  }
  return(filtMet)
}


#' Transform data matrix function
#'
#' This function transforms the data based on the type of transformation
#' @param X is the matrix of the dataset with samples on the rows and features on the columns
#' @param type.transformation is a string indicating the type of transformation to be performed on the data before fitting the model.
#' It can be one of the following: "none", "abs", "abs.alpha.power", "mul", "log.abs", "log.abs.alpha.power"; "abs.alpha.power" is the default.
#' @param power is a numeric power to be used in the abs.alpha.power and log.abs.alpha.power transformations. Default is 1.
#' @param cost is the numeric constant the data are multiplied by in the mul transformation. Default is 3.
#' @param bb is the base of the logarithm used in the log.abs and log.abs.alpha.power transformations. Default is 2.
#' @keywords data transformation
#' @export
#' @examples
#'
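#' # Minimal illustration on a small matrix
#' X <- matrix(c(-2, 0.5, 3, -0.1), nrow = 2)
#' transform_data(X, type.transformation = "abs")
#' transform_data(X, type.transformation = "abs.alpha.power", power = 2)
#' transform_data(X, type.transformation = "log.abs", bb = 2)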
transform_data = function(X,type.transformation="abs.alpha.power", power=1, cost=3, bb = 2){
  if(type.transformation=="none") return(X)
  if(type.transformation=="abs") return(abs(X))
  if(type.transformation=="mul") return(X * cost)
  if(type.transformation=="log.abs") return(log(abs(X + 1e-300),base = bb))
  if(type.transformation=="abs.alpha.power") return(abs(X)^power)
  if(type.transformation=="log.abs.alpha.power") return(log(abs(X)^power + 1e-300, base=bb))

}

#' Evaluating QSAR regression model
#'
#' This function computes the internal and external evaluation metrics for a QSAR model
#' @import glmnet
#' @param bm is a model obtained from the RCVLassoPar function
#' @param X_train is the matrix of the training dataset with samples on the rows and features on the columns. The X_train matrix has to be already transformed.
#' @param X_test is the matrix of the test dataset with samples on the rows and features on the columns. The X_test matrix has to be already transformed.
#' @param y_train is the numeric vector of response variable for training samples
#' @param y_test is the numeric vector of response variable for test samples
#' @param md_list is the character vector of molecular descriptor names
#' @param gene_list is the character vector of MoA feature (gene) names
#' @param inner.train.prop is the percentage of samples from the training set to be used as training set in the random-split method
#' @param nPermValm is the number of random split iterations used to compute the validation metrics
#' @param nPermScr is the number of random split iterations used to compute the y-scrambling test
#' @return a list with the following components:
#' \item{Metrics}{a dataframe with the internal and external metrics computed for the identified models}
#' \item{WP}{a list with the percentage of training and test samples inside the applicability domain and an object to be plotted with the Williams plot function}
#' \item{y_pred_train}{predicted values for the training samples}
#' \item{y_pred_test}{predicted values for the test samples}
#' @keywords model evaluation
#' @export
#'

evaluating_model = function(bm, X_train, X_test, y_train, y_test, md_list, gene_list, inner.train.prop,nPermValm=25, nPermScr=25){

  features = colnames(X_train)
  mse_cv = bm$pred.mse

  Perm_res = validation_metrics(y = y_train,X = X_train,train.prop=inner.train.prop,lambda = bm$opt.lambda,nPerm = nPermValm)
  Y_scramble = random_split_Y_scramble(y = y_train,X = X_train,train.prop=inner.train.prop,lambda = bm$opt.lambda,nPerm = nPermScr)

  finalModel = glmnet(x = X_train,y = y_train,lambda = bm$opt.lambda,intercept = TRUE)
  y_pred_test = predict(finalModel,X_test)
  y_pred_train = predict(finalModel,X_train)

  print("Compute Tropsha Metrics")
  c(trm1, k, trm2, k1, trm3)  %<-%  tropsha_metrics(predicted = y_pred_test,observed = y_test)
  print("Compute Ojha Metrics")
  c(rm2,dr2m)  %<-% ojha_validation_metrics(X_train = X_train,X_test = X_test,y_train = y_train,y_test = y_test,lambda = bm$opt.lambda)

  print("computed tropsha and oja metrics ")

  beta = finalModel$beta[,1]
  beta = beta[beta!=0]

  if(length(beta)==0){
    Metrics = data.frame(nMD = 0, nGE = 0, MSE_tr = 0, MSE_te=0,R2_tr = 0,R2_te = 0,
                         Q2 = 0,CCC = 0,Q2F1 = 0,Q2F2 = 0, Q2F3 = 0, rm2  = 0,dr2m = 0,trm1=0, k=0,trm2=0,k1=0,trm3=0,
                         R2Yscr= 0,Q2Yscr= 0,AD_train= 0,AD_test = 0, MDlist = "", GEList = "",Intercept = "", coefficients = "")
    print("No relevant beta coefficients!")
    return(list(Metrics=Metrics, WP=0,y_pred_test = y_pred_test, y_pred_train=y_pred_train))
  }

  mdidx = which(names(beta) %in% md_list)
  geidx = which(names(beta) %in% gene_list)

  MDs = names(beta)[mdidx]
  nMD = length(MDs)
  MDList = paste(MDs,collapse = ";")

  GEs = names(beta)[geidx]
  nGE = length(GEs)
  GEList = paste(GEs,collapse = ";")

  coeffList =paste(c(beta[mdidx],beta[geidx]),collapse = ";")

  predictive_mse = sum( { y_test  -  y_pred_test }^2 )/length(y_pred_test)

  print("computed predictive_mse")

  R2_test = R2_func(observed = y_test, predicted = predict(finalModel,X_test))
  R2_train = R2_func(observed = y_train, predicted = predict(finalModel,X_train))

  WP = williams_plot(X_train = X_train,X_test =  X_test, y_train = y_train,y_test =  y_test, model = finalModel,beta = beta)
  AD = WP$ADVal
  print("computed WP")

  names(AD) = c("AD_train","AD_test")
  names(Perm_res) = c("Q2","CCC","Q2F1","Q2F2","Q2F3","rm2")
  names(Y_scramble) = c("R2Yscr","Q2Yscr")

  Metrics = data.frame(nMD = nMD, nGE = nGE,MSE_tr = mse_cv, MSE_te=predictive_mse,R2_tr = R2_train,R2_te = R2_test,
                       Q2 = Perm_res[1],CCC = Perm_res[2],Q2F1 = Perm_res[3],Q2F2 = Perm_res[4], Q2F3 = Perm_res[5], #rm2  = Perm_res[6],
                       rm2  = rm2,dr2m = dr2m,trm1=trm1, k=k,trm2=trm2,k1=k1,trm3=trm3,
                       R2Yscr= Y_scramble[1],Q2Yscr= Y_scramble[2],AD_train= AD[1],AD_test = AD[2], MDlist = as.character(MDList),GEList = as.character(GEList), Intercept = finalModel$a0, coefficients = coeffList)

  rownames(Metrics) = NULL
  return(list(Metrics = Metrics, WP = WP, y_pred_test = y_pred_test, y_pred_train = y_pred_train))
}

#' Filter models
#'
#' This function checks whether the identified models satisfy the QSAR external and internal validation criteria
#' @param Info is a dataframe containing QSAR internal and external validation metrics for each model
#' @return a filtered dataframe containing only the QSAR models that pass the filters, ordered by decreasing R2 on the test set. It returns -1 if no model satisfies the criteria.
#' @keywords model filtering
#' @export
#' @examples
#'
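#' # Toy metrics table with hypothetical values: the first model passes all filters,
#' # the second fails the R2 and Q2 thresholds
#' Info <- data.frame(R2_tr = c(0.8, 0.4), R2_te = c(0.75, 0.3), Q2 = c(0.7, 0.2),
#'                    Q2F1 = c(0.7, 0.2), Q2F2 = c(0.7, 0.2), Q2F3 = c(0.7, 0.2),
#'                    CCC = c(0.9, 0.5), rm2 = c(0.6, 0.1), AD_test = c(100, 80),
#'                    MSE_tr = c(0.1, 0.5), MSE_te = c(0.2, 0.6),
#'                    R2Yscr = c(0.1, 0.1), Q2Yscr = c(0.05, 0.05))
#' filter_metric_models(Info)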
filter_metric_models = function(Info){

  Info$Q2 = as.numeric(as.vector(Info$Q2))
  Info$R2_tr = as.numeric(as.vector(Info$R2_tr))
  Info$R2_te = as.numeric(as.vector(Info$R2_te))
  Info$Q2F1 = as.numeric(as.vector(Info$Q2F1))
  Info$Q2F2 = as.numeric(as.vector(Info$Q2F2))
  Info$Q2F3 = as.numeric(as.vector(Info$Q2F3))
  Info$MSE_tr = as.numeric(as.vector(Info$MSE_tr))
  Info$MSE_te = as.numeric(as.vector(Info$MSE_te))
  Info$CCC = as.numeric(as.vector(Info$CCC))
  Info$rm2 = as.numeric(as.vector(Info$rm2))
  Info$R2Yscr = as.numeric(as.vector(Info$R2Yscr))
  Info$Q2Yscr = as.numeric(as.vector(Info$Q2Yscr))

  idx1 = which(Info$R2_tr>0.6)
  idx2 = which(Info$R2_te>0.6)
  idx3 = which(Info$Q2>0.5)
  idx4 = which(Info$Q2F1>0.6)
  idx5 = which(Info$Q2F2>0.6)
  idx6 = which(Info$Q2F3>0.6)
  idx7 = which(Info$CCC>0.85)
  idx8 = which(Info$rm2>0.5)
  idx9 = which(Info$AD_test == 100)

  idx = intersect(idx1,intersect(idx2,intersect(idx3, intersect(idx4, intersect(idx5, intersect(idx6,intersect(idx7,intersect(idx8,idx9))))))))

  if(length(idx)==0){
    print("No solution satisfy the metric criterias!")
    return(-1)
  }
  if(length(idx)==1){
    Info2= Info[idx,]
    return(Info2)
  }  else{
    Info2 = Info[idx,]
    Info2 = Info2[order(Info2$R2_te,decreasing = T),]
    return(Info2)
  }
}

#' Function to create train/test split
#'
#' This function creates a training-test split of the dataset
#' @importFrom caret createDataPartition
#' @param y is the response vector of the dataset
#' @param X is the matrix of the dataset with samples on the rows and features on the columns
#' @param p_train is the percentage of samples to be used in the training set
#' @return a list with the following components:
#' \item{X_train}{a matrix with the training samples}
#' \item{X_test}{a matrix with the test samples}
#' \item{y_train}{the numeric vector of responses for the training samples}
#' \item{y_test}{the numeric vector of responses for the test samples}
#' \item{train.index}{the row indices of the training samples}
#' \item{test.index}{the row indices of the test samples}
#' @keywords training/test split
#' @export
#' @examples
#'
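#' # Illustrative 80/20 split on synthetic data (requires the caret package)
#' \dontrun{
#' set.seed(1)
#' X <- matrix(rnorm(100 * 5), ncol = 5)
#' y <- rnorm(100)
#' split <- training_test_split(X, y, p_train = 0.8)
#' dim(split$X_train); dim(split$X_test)
#' }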

training_test_split = function(X,y,p_train){
  train.index <- createDataPartition(y, p = p_train, list = FALSE)

  X_train = X[train.index,]
  X_test = X[-train.index,]

  y_train = y[train.index]
  y_test = y[-train.index]

  test.index = 1:nrow(X)
  test.index = test.index[-train.index]

  return(list(X_train=X_train,X_test=X_test,y_train=y_train,y_test=y_test,train.index=train.index,test.index=test.index))
}

# Alternative training/test split: test samples are drawn only from the interquartile
# range of y, so that compounds with extreme responses remain in the training set
training_test_split_2 = function(X,y,p_train){
  # candidate test samples: responses between the 1st and 3rd quartiles of y
  idx = which(y >= quantile(y)[2] & y <= quantile(y)[4])
  # number of test samples implied by the training proportion
  nSamples = nrow(X) * (1 - p_train)
  test.idx = sample(x = idx, size = round(nSamples))

  X_train = X[-test.idx,]
  X_test = X[test.idx,]

  y_train = y[-test.idx]
  y_test = y[test.idx]
  return(list(X_train=X_train,X_test=X_test,y_train=y_train,y_test=y_test,test.idx=test.idx))
}


#' Function to compute the lambda grid values
#'
#' This function computes a grid of lambda values for the LASSO analysis
#' @param y is the response vector of the training set
#' @param x is the matrix of the dataset with samples on the rows and features on the columns
#' @param nlambda is the number of lambda values to generate
#' @return a numeric vector of length nlambda
#' @keywords internal
#' @export
#' @examples
#'
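#' # Grid of 20 lambda values for a synthetic dataset
#' set.seed(1)
#' X <- matrix(rnorm(50 * 10), ncol = 10)
#' y <- rnorm(50)
#' lams <- lambda.grid(y, X, nlambda = 20)
#' range(lams)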

lambda.grid <- function(y, x, nlambda = 100){

  n    <- length(y)
  J    <- ncol(x)

  ## Generate the initial sequence
  sx   <- scale(x)
  sx[is.na(sx)] = 0
  sy   <- as.vector(scale(y))

  u    <- rep(0, times = J)
  for (j in 1:J){
    u[j] <- abs( sum(sy * sx[,j]) )
  }

  lmax <- max(u) / n
  l <- seq(log(lmax/1000), log(lmax), length = nlambda )
  l <- exp(l)

  return(l)
}

#' Function to compute the mean
#'
#' This function computes the mean of each column of a matrix, taking into account only non-zero values
#' @param X is the matrix of the dataset with samples on the rows and features on the columns
#' @return a numeric vector of column means
#' @keywords internal
#' @export
#' @examples
#'
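#' # Column means computed over non-zero entries only; all-zero columns return 0
#' X <- matrix(c(0, 2, 4,  0, 0, 0,  1, 3, 5), ncol = 3)
#' compute_mean(X)  # returns c(3, 0, 3)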
compute_mean = function(X){
  avg = apply(X, 2, FUN = function(variables) {
    idx = which(variables!=0)
    if(length(idx)==0){
      return(0)
    }else{
      return(mean(variables[idx],na.rm=TRUE))
    }
  })
  return(avg)
}

#' Function to plot the results of the random split cross validation
#'
#' This function plots an object of class RCVLasso
#' @importFrom graphics abline arrows axis legend lines plot points
#' @param obj is the object to be plotted
#' @keywords plot
#' @export
#' @examples
#'
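#' \dontrun{
#' # Sketch assuming `fit` is an object returned by RCVLassoPar (see that function's example)
#' plot.RCVLasso(fit)
#' }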
plot.RCVLasso <- function(obj){
  n.splits <- nrow(obj$cv$mse)
  lambda   <- obj$lambda

  if(obj$measure == 'R2'){
    avg      <- apply(obj$cv$R2, 2, mean, na.rm = TRUE)
    se       <- apply(obj$cv$R2, 2, sd , na.rm = TRUE) / sqrt(n.splits)
    uci      <- avg + 1.96 * se
    lci      <- avg - 1.96 * se

    idx_star    <- which.max(avg)
    idx_star_u <- max(which(lambda >= lambda[idx_star]   &  { avg <= uci[idx_star]   &  avg >= lci[idx_star] }))
    idx_star_l <- min(which(lambda <= lambda[idx_star]   &  { avg <= uci[idx_star]   &  avg >= lci[idx_star] }))

  }else{
    avg      <- apply(obj$cv$mse, 2, mean, na.rm = TRUE)
    se       <- apply(obj$cv$mse, 2, sd , na.rm = TRUE) / sqrt(n.splits)
    uci      <- avg + 1.96 * se
    lci      <- avg - 1.96 * se

    idx_star    <- which.min(avg)
    idx_star_l <- max(which(lambda >= lambda[idx_star]   &  { avg <= uci[idx_star]   &  avg >= lci[idx_star] }))
    idx_star_u <- min(which(lambda <= lambda[idx_star]   &  { avg <= uci[idx_star]   &  avg >= lci[idx_star] }))

  }

  plot(log(lambda) , avg, t='n', ylim=range(uci, lci),
       ylab=paste('Predictive ',obj$measure,sep=''), xlab='log(lambda)')
  arrows(log(lambda), lci, log(lambda), uci,
         length=0.075, angle=90, code=3, col='grey', lwd=1.2)
  lines(log(lambda), avg, t='p', pch=20, col='red', cex=1)
  abline(v = log(lambda)[idx_star], lty = 2, col = 3)
  abline(v = log(lambda)[idx_star_u], lty = 2, col = 3)
  abline(v = log(lambda)[idx_star_l], lty = 2, col = 3)

  axis(side=3, at=log(lambda[c(idx_star, idx_star_u,idx_star_l)]),
       labels = round(apply(obj$cv$active.size, 2, mean, na.rm = TRUE)[c(idx_star, idx_star_u,idx_star_l)]))

}

#' Function to compute the active sizes at the selected lambda values
#'
#' Given the results of the random split method, this function computes the rounded average number of active coefficients at the optimal lambda and at the most parsimonious lambda within the confidence band
#' @param obj an object of class RCVLasso
#' @return a numeric vector of length two with the average active sizes at the two selected lambda values
#' @keywords internal
#' @export
#' @examples
#'
compute_active_sizes = function(obj){

  n.splits <- nrow(obj$cv$mse)
  lambda   <- obj$lambda

  if(obj$measure == 'R2'){
    avg      <- apply(obj$cv$R2, 2, mean, na.rm = TRUE)
    se       <- apply(obj$cv$R2, 2, sd , na.rm = TRUE) / sqrt(n.splits)
    uci      <- avg + 1.96 * se
    lci      <- avg - 1.96 * se

    idx_star    <- which.max(avg)
    idx_star_95 <- max(which(lambda >= lambda[idx_star]   &  { avg <= uci[idx_star]   &  avg >= lci[idx_star] }))

  }else{
    avg      <- apply(obj$cv$mse, 2, mean, na.rm = TRUE)
    se       <- apply(obj$cv$mse, 2, sd , na.rm = TRUE) / sqrt(n.splits)
    uci      <- avg + 1.96 * se
    lci      <- avg - 1.96 * se

    idx_star    <- which.min(avg)
    idx_star_95 <- max(which(lambda >= lambda[idx_star]   &  { avg <= uci[idx_star]   &  avg >= lci[idx_star] }))

  }
  xx = round(apply(obj$cv$active.size, 2, mean, na.rm = TRUE)[c(idx_star, idx_star_95)])
  return(xx)
}

#' Function to combine the results of the parallel random split method
#'
#' This function is meant to be used as the .combine argument of foreach: it stacks the per-split results of the random split method into matrices with one row per split
#' @param x is the output initialization, if present in the .init parameter of foreach
#' @param ... values returned by the foreach iterations
#' @return a list of combined objects
#' @keywords internal
#' @export
#' @examples
#'
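#' # Sketch of how two per-split results (hypothetical values) are stacked into matrices
#' split1 <- list(CVmse = c(1.0, 2.0), CVR2 = c(0.5, 0.4), CVnpreds = c(3, 2), idx_train = 1:5)
#' split2 <- list(CVmse = c(1.5, 2.5), CVR2 = c(0.6, 0.3), CVnpreds = c(4, 1), idx_train = 2:6)
#' comb(list(), split1, split2)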

comb = function(x, ...) {
  # create a list with all the first results of the foreach iteration
  y1 = lapply(list(...), function(y) y[[1]])
  # create a matrix from this list
  x1 = matrix(unlist(y1),byrow = T,ncol = length(y1[[1]]))

  y2 = lapply(list(...), function(y) y[[2]])
  x2 = matrix(unlist(y2),byrow = T,ncol = length(y2[[1]]))

  y3 = lapply(list(...), function(y) y[[3]])
  #x3 =     do.call(rbind, y3)
  x3 = matrix(unlist(y3),byrow = T,ncol = length(y3[[1]]))

  y4 = lapply(list(...), function(y) y[[4]])
  list(x1,x2,x3,y4[[length(y4)]])

}

#' Function to perform the random split analysis
#'
#' This function performs the random split analysis to estimate the optimal lambda shrinkage parameter for the LASSO model
#' @import glmnet
#' @import doParallel
#' @import foreach
#' @importFrom parallel makeCluster stopCluster
#' @param x is the matrix of the input dataset with samples on the rows and features on the columns
#' @param y is the vector of response variables with the same length as the number of samples
#' @param lambda is a numeric vector of lambda values to be used in the LASSO parameter selection step
#' @param n.splits is the number of random splits to be performed. Default value is 25
#' @param train.prop is the percentage of samples in the dataset to be used as training set. Default value is 0.9
#' @param measure is the measure used to choose the optimal lambda value. Possible values are MSE and R2. Default value is MSE
#' @param intercept is a boolean value indicating whether to fit the intercept. Default value is TRUE
#' @param type.solution is a string indicating the type of solution to compute. Possible values are: min, uci and lci. If min, the standard minimum (or maximum, for R2) of the average CV curve is used;
#' if lci, the most parsimonious solution within the confidence band around the standard solution is selected; if uci, a less parsimonious solution within the confidence band is selected
#' @param nCores is the number of cores to be used
#' @param th is the multiplier used to build the confidence interval. Default value is 1.96
#' @return an object of class RCVLasso containing the following objects:
#' \item{cv}{list with matrices (of size n.splits x n.lambda) containing statistics for each lambda and each split.
#' mse is the matrix of predictive mse, R2 is the matrix of predictive R2 and active.size is the matrix with the number of active betas in the trained model for each lambda and each split.}
#' \item{lambda}{numeric vector of the lambda values taken in input}
#' \item{fitted}{predicted values}
#' \item{residuals}{differences between predicted and real values}
#' \item{intercept}{if the input parameter intercept is TRUE, it is the numeric value of the fitted intercept, otherwise it is zero}
#' \item{beta}{beta coefficients of the fitted model}
#' \item{opt.lambda}{optimal lambda value}
#' \item{idx.support}{index of the optimal lambda value}
#' \item{pred.R2}{R2 of the optimal model on the test sets}
#' \item{pred.mse}{mse of the optimal model on the test sets}
#' \item{mse}{mse of the optimal model on the training sets}
#' \item{R2}{R2 of the optimal model on the training sets}
#' \item{measure}{measure used to compute the optimal solution}
#' \item{fitted_model}{model fitted on the whole data}
#' @keywords random split lasso
#' @export
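#' @examples
#' \dontrun{
#' # Illustrative sketch on synthetic data; small sizes and 2 cores chosen only for speed
#' set.seed(1)
#' X <- matrix(rnorm(60 * 20), ncol = 20)
#' y <- X[, 1] - 0.5 * X[, 2] + rnorm(60, sd = 0.1)
#' lams <- lambda.grid(y, X, nlambda = 30)
#' fit <- RCVLassoPar(y = y, x = X, lambda = lams, n.splits = 5, nCores = 2)
#' fit$opt.lambda
#' plot(fit)
#' }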
RCVLassoPar  = function(y , x , lambda , n.splits = 25 , train.prop =  0.9,measure = 'MSE',
                        intercept = TRUE ,  type.solution = 'lci',nCores = 20,th = 1.96){
  cl = makeCluster(nCores)
  registerDoParallel(cl)

  n        <- length(y)
  ntrain   <- ceiling(train.prop * n)
  ntest    <- n-ntrain
  nlambda  <- length(lambda)

  print("starting foreach...")
  oper <- foreach(s=1:n.splits,.combine=comb,.multicombine=TRUE,
                  .init=list(c(), c(),c())) %dopar% {

                    CVmse <- CVR2   <- CVnpreds  <- rep(0, nlambda)

                    ## Sample training set
                    idx_train <- sample(1:n, size = ntrain, replace = FALSE)

                    ## Cycle over tuning parameters
                    for (l in 1:nlambda){
                      ## dummy initial containers
                      fit  <- NULL
                      yhat <- rep(NA, times = ntest)
                      RSS <- TSS <- 0
                      ## Try to perform the LASSO
                      try({
                        fit <- glmnet(y = y[idx_train], x = x[idx_train, ],
                                      lambda = lambda[l] , intercept = intercept)
                        fit$beta <- as.numeric(fit$beta)
                      }, silent = TRUE)


                      if(!is.null(fit)){

                        ## Select relevant predictors
                        idx_relevant  <- which(fit$beta !=0)
                        nRel <- length(idx_relevant)

                        CVnpreds[l] = nRel

                        if( nRel >= 2){
                          if (intercept){
                            if(fit$a0!=0){
                              yhat <- cbind(1,x[-idx_train, ]) %*%  c(fit$a0,   fit$beta)
                            }
                          }else{
                            yhat <- x[-idx_train, ] %*%  fit$beta
                          }
                          yhat <- as.numeric(yhat)
                          RSS <- sum( { y[-idx_train]  -  yhat }^2 )
                          TSS <- sum( { y[-idx_train]  -  mean(y[-idx_train]) }^2 )
                          CVmse[l] = RSS / ntest
                          r2 = 1 - RSS / TSS
                          if(r2<0){
                            CVR2[l] =  0
                          }else{
                            CVR2[l] =  r2
                          }
                        }else{
                          ## If none of the regressors is selected we consider the constant model
                          ## y = mean(y) + error
                          RSS         <- sum( { y[-idx_train]  -  mean(y[-idx_train]) }^2 )
                          CVmse[l] =  RSS / ntest
                          CVR2[l] = 0
                        }

                      } ## END: if(!is.null(fit))
                    }  ## ENDS for  for (l in 1:nlambda)

                    toRet =list(CVmse=CVmse,CVR2=CVR2,CVnpreds=CVnpreds,idx_train = idx_train)

                    return(toRet)
                  }## ENDS foreach (s in 1:n.splits)

  print("end foreach...")
  stopCluster(cl)

  CVmse = oper[[1]]
  CVR2 = oper[[2]]
  CVnpreds = oper[[3]]
  idx_train = oper[[4]]

  print("check measure...")

  ## Select the final lambda
  if(measure=='R2'){
    avg  <- apply(CVR2, 2, mean, na.rm=TRUE)
    se   <- apply(CVR2, 2, sd, na.rm=TRUE) / sqrt(n.splits)
    uci  <- avg +  th * se
    lci  <- avg - th * se
    if(type.solution=='min'){
      #  print('min')
      idx_best  <- which.max(avg)
    }else{
      #  print('ci')
      idx_star <- which.max(avg)
      idx_best <- max(which(lambda >= lambda[idx_star]   &  { avg <= uci[idx_star]   &  avg >= lci[idx_star] }))
    }
  }

  if(measure=='MSE'){
    #   print('mse')
    avg  <- apply(CVmse, 2, mean, na.rm=TRUE)
    se   <- apply(CVmse, 2, sd, na.rm=TRUE) / sqrt(n.splits)
    uci  <- avg + th * se
    lci  <- avg - th * se
    if(type.solution=='min'){
      #    print('min')
      idx_best  <- which.min(avg)
    }
    if(type.solution=="uci"){ #more features
      #   print('ci')
      idx_star <- which.min(avg)
      idx_best <- min(which(lambda <= lambda[idx_star]   &  { avg <= uci[idx_star]   &  avg >= lci[idx_star] }))
    }
    if(type.solution=="lci"){ #less features
      idx_star <- which.min(avg)
      idx_best <- max(which(lambda >= lambda[idx_star]   &  { avg <= uci[idx_star]   &  avg >= lci[idx_star] }))

    }

  }


  ## Fit the optimal lasso on the entire sample
  ##

  print("fit optimal lasso...")
  lstar         <- lambda[idx_best]
  print(paste("LStar:", lstar))
  fit           <- glmnet(y = y, x = x, lambda = lstar , intercept = intercept)
  fit$beta      <- as.numeric(fit$beta)
  idx_relevant  <- which(fit$beta !=0)

  # print(dim(CVnpreds))
  # print(dim(CVmse))
  # print(dim(CVmse))
  #
  #
  # print(n.splits)
  # print(nlambda)
  print("assigning idx relevant")

  CVnpreds[nrow(CVnpreds),nlambda] <- length(idx_relevant)
  print("assigned idx relevant")

  if( length(idx_relevant) >= 1){
    if (intercept){
      if(fit$a0!=0){
        yhat <- cbind(1,x) %*%  c(fit$a0,   fit$beta)
      }
    }else{
      yhat <- x %*%  fit$beta
    }
    yhat <- as.numeric(yhat)
    RSS  <- sum( { y  -  yhat }^2 )
    TSS  <- sum( { y  -  mean(y) }^2 )
    R2   <- 1 - (RSS / TSS)
  }else{
    ## If none of the regressors is selected we consider the constant model
    ## y = mean(y) + error
    yhat   <- rep(mean(y), n)
    RSS    <- sum( { y  -  mean(y) }^2 )
    R2     <- 0
  }

  ## Output
  ans             <- list()
  ans$cv          <- list(mse = CVmse, R2 = CVR2, active.size = CVnpreds)
  ans$lambda      <- lambda
  ans$fitted      <- yhat
  ans$residuals   <- as.numeric(y-yhat)
  if(intercept){
    ans$intercept  <- fit$a0
  }else{
    ans$intercept  <- 0
  }
  ans$beta        <- fit$beta
  ans$opt.lambda  <- lstar
  ans$idx.support <- idx_relevant
  #ans$pred.R2     <- avg[idx_best]
  ans$pred.R2     <- apply(CVR2, 2, mean, na.rm=TRUE)[idx_best]
  ans$pred.mse    <- apply(CVmse, 2, mean, na.rm=TRUE)[idx_best]
  ans$mse         <- RSS/n
  ans$R2          <- R2
  ans$measure = measure
  ans$fitted_model = fit

  class(ans) <- 'RCVLasso'

  return(ans)
}


# Scatterplot of experimental versus predicted values for training (black) and test (red)
# samples, with the identity line as a reference
plot_pred_real = function(pred_train, pred_test, train_class, test_class, xlab, ylab){
  min_x = min(pred_train,pred_test)
  max_x = max(pred_train,pred_test)
  min_y = min(train_class,test_class)
  max_y = max(train_class,test_class)
  train_tp = cbind(pred_train, train_class)
  # experimental values on the x axis, predicted values on the y axis (matching the default axis labels)
  plot(train_class, pred_train, xlim = c(min_y,max_y), ylim = c(min_y,max_y), xlab = xlab, ylab = ylab)
  points(test_class, pred_test, col="red")
  lines(x = c(min_y,max_y), y = c(min_y,max_y))
  #text(x = 1,y=-1,labels = paste("MSE: ",round(sum( { train_class  -  train_tp[,1] }^2 )/nrow(train_tp),2),sep=""))
  legend(x = "bottomright",legend = c("Train","Test"),fill = c("black","red"),bty = "n")
}