hyQSAR: Hybrid QSAR model

Documented in tqsar

#' Random split for glmnet
#'
#' Computes random split validation for glmnet, produces a plot, and returns a value for lambda
#' @import zeallot
#' @param X is the dataset matrix with samples on rows and features on columns.
#' @param y is the numeric vector of response variables.
#' @param type.transformation is a string indicating the type of transformation to be performed at the data before fitting the model.
#' It can be one of the following: "none","abs","abs.alpha.power","mul","log.abs","log.abs.alpha.power"; none is the default parameter.
#' @param alpha_values is a numeric vector of alpha value to be used in the abs.alpha.power transformation. Default value is c(0.1,0.25,0.5,0.75,1,1.25,1.5,1.75,2)
#' @param cost is a numeric constant used when type.transformation = mul.
#' @param bb is the base of the logarithm used when type.transformation = log.abs or log.abs.alpha.power
#' @param type.solution is a string indicating the type of solution to compute. Possible values are: min, uci and lci; if standard min or max of the average CV function.
#' if uci, take the the most parsimonous solution within the tot percentage of confidence bands around the standard solution. if lci a less partimosious solution within the tot percentage of confidence bands around the standard solution is selected
#' @param measure is the measure used to perform the choice of the optimal lambda value. Possible values are MSE and R2. Default value is MSE
#' @param intercept is a boolean valus indicating if we want to fit or not the intercept. Default valuw is TRUE
#' @param th is the size of the confidence interval. Default value is 1.96
#' @param cor_th is the maximum accepted correlation between couple of features. Default value is 0.95
#' @param zero_th is the maximum percentage of zeros accepted in a feature. Default value is 0.8
#' @param n.splits is the number of random split to be performed. Default value is 25
#' @param inner.train.prop is the percentage of samples from the train test to be used as training set in the random-split method. Default value is 0.9 (90)
#' @param nLambda number of lambda to be tested in the LASSO model
#' @param n.splits.val is the number of random split to compute validation metrics. Default value is 25
#' @param n.splits.scr is the number of random split to perform the y-scrambling test. Default value is 25
#' @param nCores is the number of cores to be used
#' @return an object of class tqsar containing a list of final models,
#' a list of williams plot object
#' a dataframe with all the metrices computed for the different models, and
#' a list of lambda values used to train the LASSO model.
#' \item{finalModels}{a list with one or more models coming from the RCVLasso function}
#' \item{williams_plots}{a list of one or more williams plot objects}
#' \item{Metrics}{a dataframe with all the internal and external metrics estimated for every model}
#' \item{lambda}{a list of one or more vector of lambda used to tune the LASSO models}
#' @keywords qsar
#' @export

tqsar = function(X_train,X_test,y_train,y_test,type.transformation = "none",alpha_values = c(0.1,0.25,0.5,0.75,1,1.25,1.5,1.75,2),
                 cost=3, bb = 2, type.solution="lci", measure="MSE", intercept= TRUE, th = 1.96,
                 cor_th=.95, zero_th=.8,
                 n.splits = 10,inner.train.prop=0.9, nLambda = 100,n.splits.val=25,n.splits.scr=25,
                 nCores = 10,md_list = c(),gene_list = c()){

  if(!type.transformation %in% c("none","abs","abs.alpha.power","mul","log.abs","log.abs.alpha.power"))
    stop("'type.transformation' must be one of the following: none,abs,abs.alpha.power,mul,log.abs,log.abs.alpha.power")

  if(!measure %in% c("MSE","R2"))
    stop("'measure' must be one of the following: 'MSE','R2'")

  if(!type.solution %in% c("uci","lci","min"))
    stop("'type.solution' must be one of the following: 'uci','lci','min'")

  if(cor_th < 0 | cor_th >1 )  stop("'cor_th' must be a number between 0 and 1")
  if(zero_th < 0 | zero_th >1 )  stop("'zero_th' must be a number between 0 and 1")
  #if(p_train < 0 | p_train >1 )  stop("'p_train' must be a number between 0 and 1")
  if(inner.train.prop < 0 | inner.train.prop >1 )  stop("'inner.train.prop' must be a number between 0 and 1")

  set.seed(1)

  # training/test split
  #c(X_train,X_test,y_train,y_test,xxx)  %<-% training_test_split(X,y,p_train)

  if(type.transformation == "none" | type.transformation == "abs" | type.transformation == "mul" | type.transformation == "log.abs"){

    X_train_t = transform_data(X_train,type.transformation, cost = cost,bb = bb)
    X_test_t = transform_data(X_test,type.transformation, cost = cost, bb=bb)

    L <- lambda.grid(y=y_train, x=as.matrix(X_train_t), nlambda = nLambda)

    u2f <- RCVLassoPar(y = y_train , x = as.matrix(X_train_t) , lambda = L, n.splits = n.splits  , train.prop =  inner.train.prop,
                       intercept = intercept ,  type.solution = type.solution,measure = measure,nCores = nCores,th = th)

    c(Me, WP,y_pred_test,y_pred_train)  %<-% evaluating_model(bm = u2f, X_train = as.matrix(X_train_t), X_test = as.matrix(X_test_t), y_train, y_test, md_list=md_list, gene_list=gene_list,inner.train.prop,nPermValm = n.splits.val,nPermScr = n.splits.scr)

    final_models = list(u2f=u2f)
    WP_list = list(WP=WP)
    lambda_list = list(L=L)
    y_pred_tr_list = list(y_pred_train=y_pred_train)
    y_pred_te_list = list(y_pred_test=y_pred_test)
    Metrics = Me
  }
  if(type.transformation == "abs.alpha.power" | type.transformation == "log.abs.alpha.power"){
    # Metrics = data.frame(nMD = 0, nGE = 0, MSE_tr = 0, MSE_te=0,R2_tr = 0,R2_te = 0,
    #                      Q2 = 0,CCC = 0,Q2F1 = 0,Q2F2 = 0, Q2F3 = 0, rm2  = 0,
    #                      R2Yscr= 0,Q2Yscr= 0,AD_train= 0,AD_test = 0, MDlist = "mdlist",GElist = "gelist",Intercept = 0, coefficients = "coeff")
    Metrics = data.frame(nMD = 0, nGE = 0, MSE_tr = 0, MSE_te=0,R2_tr = 0,R2_te = 0,
                         Q2 = 0,CCC = 0,Q2F1 = 0,Q2F2 = 0, Q2F3 = 0, #rm2  = 0,
                         rm2  = 0,drm2 = 0,trm1=0, k=0,trm2=0,k1=0,trm3=0,
                         R2Yscr= 0,Q2Yscr= 0,AD_train= 0,AD_test = 0, MDlist = "", GElist = "",Intercept = 0, coefficients = "")

    final_models = list()
    WP_list = list()
    lambda_list = list()
    y_pred_tr_list = list()
    y_pred_te_list = list()

    for(a in 1:length(alpha_values)){
      print(a)
      #X_train_t = transform_data(X,type.transformation, power = alpha_values[a], bb=bb)
      X_train_t = transform_data(X_train,type.transformation, power = alpha_values[a], bb=bb)
      X_test_t = transform_data(X_test,type.transformation, power = alpha_values[a], bb=bb)

      L <- lambda.grid(y=y_train, x=as.matrix(X_train_t), nlambda = nLambda)

      u2f <- RCVLassoPar(y = y_train , x = X_train_t , lambda = L, n.splits = n.splits  , train.prop =  inner.train.prop,
                         intercept = intercept ,  type.solution = type.solution,measure = measure,nCores = nCores,th = th)
      c(Me, WP,y_pred_test,y_pred_train)  %<-% evaluating_model(bm = u2f, X_train = as.matrix(X_train_t), X_test = as.matrix(X_test_t), y_train, y_test, md_list=md_list, gene_list=gene_list,inner.train.prop,nPermValm = n.splits.val,nPermScr = n.splits.scr)

      colnames(Me) = colnames(Metrics)
      Metrics = rbind(Metrics, Me)
      WP_list[[a]] = WP
      final_models[[a]] = u2f
      lambda_list[[a]] = L
      y_pred_tr_list[[a]] = y_pred_train
      y_pred_te_list[[a]] = y_pred_test

    } #end for alpha
    Metrics = Metrics[-1,]
  } #end if type.transformation == "abs.alpha.power"

  ## Output
  ans                    <- list()
  ans$finalModels        <- final_models
  ans$williams_plots     <- WP_list
  ans$Metrics            <- Metrics
  ans$lambda             <- lambda_list
  ans$pred_tr             <- y_pred_tr_list
  ans$pred_te             <- y_pred_te_list
  ans$X_train             <- X_train
  ans$X_test              <- X_test
  ans$y_train             <- y_train
  ans$y_test              <- y_test
  ans$test.idx            <- test.idx
  class(ans) <- 'tqsar'
  return(ans)
}