hyQSAR: Hybrid QSAR model

Documented in hqsar

#' Random split for glmnet
#'
#' Computes random split validation for glmnet, produces a plot, and returns a value for lambda
#' @import zeallot
#' @param X_md is the dataset matrix with samples on rows and MDs on columns.
#' @param X_ge is the dataset matrix with samples on rows and genes on columns.
#' @param y is the numeric vector of response variables.
#' @param type.transformation is a string indicating the type of transformation to be performed at the data before fitting the model.
#' It can be one of the following: "none","abs","abs.alpha.power","mul","log.abs","log.abs.alpha.power"; none is the default parameter.
#' @param alpha_values is a numeric vector of alpha value to be used in the abs.alpha.power transformation. Default value is c(0.1,0.25,0.5,0.75,1,1.25,1.5,1.75,2)
#' @param cost is a numeric constant used when type.transformation = mul.
#' @param bb is the base of the logarithm used when type.transformation = log.abs or log.abs.alpha.power
#' @param type.solution is a string indicating the type of solution to compute. Possible values are: min, uci and lci; if standard min or max of the average CV function.
#' if uci, take the the most parsimonous solution within the tot percentage of confidence bands around the standard solution. if lci a less partimosious solution within the tot percentage of confidence bands around the standard solution is selected
#' @param measure is the measure used to perform the choice of the optimal lambda value. Possible values are MSE and R2. Default value is MSE
#' @param intercept is a boolean valus indicating if we want to fit or not the intercept. Default valuw is TRUE
#' @param th is the size of the confidence interval. Default value is 1.96
#' @param n.splits is the number of random split to be performed. Default value is 25
#' @param inner.train.prop is the percentage of samples from the train test to be used as training set in the random-split method. Default value is 0.9 (90)
#' @param nLambda number of lambda to be tested in the LASSO model
#' @param n.splits.val is the number of random split to compute validation metrics. Default value is 25
#' @param n.splits.scr is the number of random split to perform the y-scrambling test. Default value is 25
#' @param nCores is the number of cores to be used
#' @return an object of class tqsar containing a list of final models,
#' a list of williams plot object
#' a dataframe with all the metrices computed for the different models, and
#' a list of lambda values used to train the LASSO model.
#' \item{finalModels}{a list with one or more models coming from the RCVLasso function}
#' \item{williams_plots}{a list of one or more williams plot objects}
#' \item{Metrics}{a dataframe with all the internal and external metrics estimated for every model}
#' \item{lambda}{a list of one or more vector of lambda used to tune the LASSO models}
#' @keywords qsar
#' @export

hqsar = function(X_md_train,X_md_test, X_ge_train, X_ge_test,y_train, y_test,
                 type.transformation.md = "none",type.transformation.ge = "none",
                 alpha = c(0.1,0.25,0.5,0.75,1,1.25,1.5,1.75,2),
                 alpha1 = c(0.1,0.25,0.5,0.75,1,1.25,1.5,1.75,2),
                 cost.md=3,cost.ge=3, bb.md = 2,bb.ge=2, type.solution="lci", measure="MSE", intercept= TRUE, th = 1.96,
                 n.splits = 99,inner.train.prop=0.9, nLambda = 100, n.splits.val=25, n.splits.scr=25,
                 nCores = 30){


  if(!type.transformation.md %in% c("none","abs","abs.alpha.power","log.abs","log.abs.alpha.power")) #mul
    stop("'type.transformation.md' must be one of the following: none,abs,abs.alpha.power,log.abs,log.abs.alpha.power")

  if(!type.transformation.ge %in% c("none","abs","abs.alpha.power","log.abs","log.abs.alpha.power")) #mul
    stop("'type.transformation.ge' must be one of the following: none,abs,abs.alpha.power,log.abs,log.abs.alpha.power")


  if(!measure %in% c("MSE","R2"))
    stop("'measure' must be one of the following: 'MSE','R2'")

  if(!type.solution %in% c("uci","lci","min"))
    stop("'type.solution' must be one of the following: 'uci','lci','min'")

  if(inner.train.prop < 0 | inner.train.prop >1 )  stop("'inner.train.prop' must be a number between 0 and 1")

  set.seed(1)

  md_list = colnames(X_md)
  gene_list = colnames(X_ge)

  if(type.transformation.ge == type.transformation.md){
    type.transformation = type.transformation.ge
    print("Apply Same Transformation")

    if(type.transformation == "none" | type.transformation == "abs" | type.transformation == "mul" | type.transformation == "log.abs"){

      X_train_t = cbind(transform_data(X_md_train,type.transformation.md, bb=bb.md,cost = cost.md),
                        transform_data(X_ge_train,type.transformation.ge, bb=bb.ge,cost = cost.ge))
      X_test_t = cbind(transform_data(X_md_test,type.transformation.md, bb=bb.md,cost = cost.md),
                       transform_data(X_ge_test,type.transformation.ge, bb=bb.ge,cost = cost.ge))


      L <- lambda.grid(y=y_train, x=as.matrix(X_train_t), nlambda = nLambda)
      u2f <- RCVLassoPar(y = y_train , x = as.matrix(X_train_t) , lambda = L, n.splits = n.splits  , train.prop =  inner.train.prop,
                         intercept = intercept ,  type.solution = type.solution,measure = measure,nCores = nCores,th = th)

      c(Me, WP,y_pred_test,y_pred_train)  %<-% evaluating_model(bm = u2f, X_train = as.matrix(X_train_t), X_test = as.matrix(X_test_t), y_train, y_test, md_list=md_list, gene_list=gene_list,inner.train.prop,nPermValm = n.splits.val,nPermScr = n.splits.scr)

      final_models = list(u2f=u2f)
      WP_list = list(WP=WP)
      lambda_list = list(L=L)
      y_pred_tr_list = list(y_pred_train=y_pred_train)
      y_pred_te_list = list(y_pred_test=y_pred_test)
      Metrics = Me
    }
    if(type.transformation == "abs.alpha.power" | type.transformation == "log.abs.alpha.power"){
      Metrics = data.frame(md_pow = 0, ge_pow = 0,nMD = 0, nGE = 0, MSE_tr = 0, MSE_te=0,R2_tr = 0,R2_te = 0,
                           Q2 = 0,CCC = 0,Q2F1 = 0,Q2F2 = 0, Q2F3 = 0, #rm2  = 0,
                           rm2  = 0,dr2m = 0,trm1=0, k=0,trm2=0,k1=0,trm3=0,
                           R2Yscr= 0,Q2Yscr= 0,AD_train= 0,AD_test = 0, MDlist = "", GElist = "",Intercept = 0, coefficients = "")

      final_models = list()
      WP_list = list()
      lambda_list = list()
      y_pred_tr_list = list()
      y_pred_te_list = list()

      for(b in 1:length(alpha1)){
        print(b)
        for(a in 1:length(alpha)){
          print(a)
          X_train_t = cbind(transform_data(X_md_train,type.transformation.md, power = alpha1[b], bb=bb.md,cost = cost.md),
                            transform_data(X_ge_train,type.transformation.ge, power = alpha[a], bb=bb.ge,cost = cost.ge))
          X_test_t = cbind(transform_data(X_md_test,type.transformation.md, power = alpha1[b], bb=bb.md,cost = cost.md),
                           transform_data(X_ge_test,type.transformation.ge, power = alpha[a], bb=bb.ge,cost = cost.ge))

          L <- lambda.grid(y=y_train, x=X_train_t, nlambda = nLambda)

          u2f <- RCVLassoPar(y = y_train , x = X_train_t , lambda = L, n.splits = n.splits  , train.prop =  inner.train.prop,
                             intercept = intercept ,  type.solution = type.solution,measure = measure,nCores = nCores,th = th)
          c(Me, WP,y_pred_test,y_pred_train)  %<-% evaluating_model(bm = u2f, X_train = as.matrix(X_train_t), X_test = as.matrix(X_test_t),
                                                                    y_train, y_test, md_list=md_list, gene_list=gene_list,inner.train.prop,
                                                                    nPermValm = n.splits.val,nPermScr = n.splits.scr)

          i.pows = data.frame(md_pow=alpha1[b], ge_pow=alpha[a])
          Me = cbind(i.pows,Me)
          colnames(Metrics)  = colnames(Me)
          Metrics = rbind(Metrics, Me)
          WP_list[[paste(alpha1[b],"-",alpha[a],sep="")]] = WP
          final_models[[paste(alpha1[b],"-",alpha[a],sep="")]] = u2f
          lambda_list[[paste(alpha1[b],"-",alpha[a],sep="")]] = L
          y_pred_tr_list[[paste(alpha1[b],"-",alpha[a],sep="")]] = y_pred_train
          y_pred_te_list[[paste(alpha1[b],"-",alpha[a],sep="")]] = y_pred_test

        } #end for alpha
      }
      Metrics = Metrics[-1,]
    } #end if type.transformation == "abs.alpha.power"
  }else{
    print("Apply different Transformation")

    if(type.transformation.md %in% c("none","abs","mul","log.abs") & type.transformation.ge %in% c("none","abs","mul","log.abs")){
      print(paste("MD transformation",type.transformation.md))
      print(paste("GE transformation",type.transformation.ge))

      X_train_t = cbind(transform_data(X_md_train,type.transformation.md, bb=bb.md,cost = cost.md),
                        transform_data(X_ge_train,type.transformation.ge, bb=bb.ge,cost = cost.ge))
      X_test_t = cbind(transform_data(X_md_test,type.transformation.md, bb=bb.md,cost = cost.md),
                       transform_data(X_ge_test,type.transformation.ge, bb=bb.ge,cost = cost.ge))

      L <- lambda.grid(y=y_train, x=as.matrix(X_train_t), nlambda = nLambda)
      u2f <- RCVLassoPar(y = y_train , x = as.matrix(X_train_t) , lambda = L, n.splits = n.splits  , train.prop =  inner.train.prop,
                         intercept = intercept ,  type.solution = type.solution,measure = measure,nCores = nCores,th = th)

      c(Me, WP,y_pred_test,y_pred_train)  %<-% evaluating_model(bm = u2f, X_train = as.matrix(X_train_t), X_test = as.matrix(X_test_t), y_train, y_test, md_list=md_list, gene_list=gene_list,inner.train.prop,nPermValm = n.splits.val,nPermScr = n.splits.scr)

      final_models = list(u2f=u2f)
      WP_list = list(WP=WP)
      lambda_list = list(L=L)
      y_pred_tr_list = list(y_pred_train=y_pred_train)
      y_pred_te_list = list(y_pred_test=y_pred_test)
      Metrics = Me
    }
    if((type.transformation.md %in% c("none","abs","mul","log.abs") & type.transformation.ge %in% c("abs.alpha.power","log.abs.alpha.power")) |
       (type.transformation.ge %in% c("none","abs","mul","log.abs") & type.transformation.md %in% c("abs.alpha.power","log.abs.alpha.power"))){
      print(paste("MD transformation",type.transformation.md))
      print(paste("GE transformation",type.transformation.ge))

      x1 = c(ge = type.transformation.ge,md=type.transformation.md)
      xx1 = names(x1)[which(x1 %in% c("abs.alpha.power","log.abs.alpha.power"))]

      Metrics = data.frame(md_pow = 0, ge_pow = 0,nMD = 0, nGE = 0, MSE_tr = 0, MSE_te=0,R2_tr = 0,R2_te = 0,
                           Q2 = 0,CCC = 0,Q2F1 = 0,Q2F2 = 0, Q2F3 = 0, #rm2  = 0,
                           rm2  = 0,dr2m = 0,trm1=0, k=0,trm2=0,k1=0,trm3=0,
                           R2Yscr= 0,Q2Yscr= 0,AD_train= 0,AD_test = 0, MDlist = "", GElist = "",Intercept = 0, coefficients = "")

      final_models = list()
      WP_list = list()
      lambda_list = list()
      y_pred_tr_list = list()
      y_pred_te_list = list()

      for(a in 1:length(alpha)){

        X_train_t = cbind(transform_data(X_md_train,type.transformation.md, bb=bb.md,cost = cost.md),
                          transform_data(X_ge_train,type.transformation.ge, bb=bb.ge,cost = cost.ge))
        X_test_t = cbind(transform_data(X_md_test,type.transformation.md, bb=bb.md,cost = cost.md),
                         transform_data(X_ge_test,type.transformation.ge, bb=bb.ge,cost = cost.ge))

        L <- lambda.grid(y=y_train, x=as.matrix(X_train_t), nlambda = nLambda)

        u2f <- RCVLassoPar(y = y_train , x = X_train_t , lambda = L, n.splits = n.splits  , train.prop =  inner.train.prop,
                           intercept = intercept ,  type.solution = type.solution,measure = measure,nCores = nCores,th = th)
        c(Me, WP,y_pred_test,y_pred_train)  %<-% evaluating_model(bm = u2f, X_train = as.matrix(X_train_t),
                                                X_test = as.matrix(X_test_t), y_train, y_test, md_list=md_list,
                                                gene_list=gene_list,inner.train.prop,nPermValm = n.splits.val,nPermScr = n.splits.scr)

        if(xx1 == "ge"){
          i.pows = data.frame(md_pow="", ge_pow=alpha[a])
        }else{
          i.pows = data.frame(md_pow=alpha[a], ge_pow="")
        }

        Me = cbind(i.pows,Me)
        colnames(Metrics)  = colnames(Me)
        Metrics = rbind(Metrics, Me)
        WP_list[[a]] = WP
        final_models[[a]] = u2f
        lambda_list[[a]] = L
        y_pred_tr_list[[a]] = y_pred_train
        y_pred_te_list[[a]] = y_pred_test

      } #end for alpha
      Metrics = Metrics[-1,]
    }
    if(type.transformation.md %in% c("abs.alpha.power","log.abs.alpha.power") & type.transformation.ge %in% c("abs.alpha.power","log.abs.alpha.power")){
      print(paste("MD transformation",type.transformation.md))
      print(paste("GE transformation",type.transformation.ge))

      Metrics = data.frame(md_pow = 0, ge_pow = 0,nMD = 0, nGE = 0, MSE_tr = 0, MSE_te=0,R2_tr = 0,R2_te = 0,
                           Q2 = 0,CCC = 0,Q2F1 = 0,Q2F2 = 0, Q2F3 = 0, #rm2  = 0,
                           rm2  = 0,dr2m = 0,trm1=0, k=k,trm2=0,k1=0,trm3=0,
                           R2Yscr= 0,Q2Yscr= 0,AD_train= 0,AD_test = 0, MDlist = "", GElist = "",Intercept = 0, coefficients = "")

      final_models = list()
      WP_list = list()
      lambda_list = list()
      y_pred_tr_list = list()
      y_pred_te_list = list()

      for(b in 1:length(alpha1)){
        print(b)
        for(a in 1:length(alpha)){
          print(a)
          X_train_t = cbind(transform_data(X_md_train,type.transformation.md, power = alpha1[b], bb=bb.md,cost = cost.md),
                            transform_data(X_ge_train,type.transformation.ge, power = alpha[a], bb=bb.ge,cost = cost.ge))
          X_test_t = cbind(transform_data(X_md_test,type.transformation.md, power = alpha1[b], bb=bb.md,cost = cost.md),
                           transform_data(X_ge_test,type.transformation.ge, power = alpha[a], bb=bb.ge,cost = cost.ge))

          L <- lambda.grid(y=y_train, x=X_train_t, nlambda = nLambda)

          u2f <- RCVLassoPar(y = y_train , x = X_train_t , lambda = L, n.splits = n.splits  , train.prop =  inner.train.prop,
                             intercept = intercept ,  type.solution = type.solution,measure = measure,nCores = nCores,th = th)
          c(Me, WP,y_pred_test,y_pred_train)  %<-% evaluating_model(bm = u2f, X_train = as.matrix(X_train_t), X_test = as.matrix(X_test_t),
                                                                    y_train, y_test, md_list=md_list, gene_list=gene_list,inner.train.prop,
                                                                    nPermValm = n.splits.val,nPermScr = n.splits.scr)

          i.pows = data.frame(md_pow=alpha1[b], ge_pow=alpha[a])
          Me = cbind(i.pows,Me)
          colnames(Metrics)  = colnames(Me)
          Metrics = rbind(Metrics, Me)
          WP_list[[paste(alpha1[b],"-",alpha[a],sep="")]] = WP
          final_models[[paste(alpha1[b],"-",alpha[a],sep="")]] = u2f
          lambda_list[[paste(alpha1[b],"-",alpha[a],sep="")]] = L
          y_pred_tr_list[[paste(alpha1[b],"-",alpha[a],sep="")]] = y_pred_train
          y_pred_te_list[[paste(alpha1[b],"-",alpha[a],sep="")]] = y_pred_test

        } #end for alpha
      }
      Metrics = Metrics[-1,]
    }

  }


  ## Output
  ans                    <- list()
  ans$finalModels        <- final_models
  ans$williams_plots     <- WP_list
  ans$Metrics            <- Metrics
  ans$lambda             <- lambda_list
  ans$pred_tr             <- y_pred_tr_list
  ans$pred_te             <- y_pred_te_list
  ans$X_train             <- X_train
  ans$X_test              <- X_test
  ans$y_train             <- y_train
  ans$y_test              <- y_test
  #ans$test.idx            <- test.index
  class(ans) <- 'hqsar'
  return(ans)
}