R/AIModels.R

Defines functions get_Models get_Model_params get_linearBayes get_Model use_linearBayes use_Model

Documented in get_linearBayes get_Model get_Model_params get_Models use_linearBayes use_Model

#' get a list of the implemented regression algorithms
#' @description Get a list of the implemented regression algorithms.
#'
#' @examples
#' list_models <- get_Models()
#' @return a list of the implemented regression algorithms with their nickname, as implemented in iqspr, and their common
#' name from literature.
#'
#' @export get_Models
get_Models <- function(){
  Models_list = c("linear_Bayes",
                  "elasticnet",
                  "svm",
                  "ranger",
                  "bagging",
                  "gradientboost")
  Models_list <- cbind(Models_list, c("Bayesian linear",
                                      "Elastic Net",
                                      "Support Vector Machines",
                                      "Random Forests",
                                      "Bagging",
                                      "Gradient Boosting"))
  Models_list <- cbind(Models_list, c("custom implementation",
                                      "glmnet",
                                      "kernlab",
                                      "ranger",
                                      "ranger",
                                      "xgboost"))

  Models_list <- cbind(Models_list, c("custom implementation",
                                      "cv.glmnet",
                                      "ksvm",
                                      "ranger",
                                      "ranger",
                                      "xgboost"))

  colnames(Models_list) <- c("nickname","name","package", "call")

  print(Models_list)
}

#' get a list of default parameters for a given regression algorithm
#' @description Get a list of default parameters for a given regression algorithm.
#'
#' @param model_name is the model's name for which you want to display the default parameters ("elasticnet" by default).
#' For the "svm" model, a.k.a Support Vector Machine, Gaussian RBF or Laplace kernel ("rbfdot" or "laplacedot" respectively)
#' are supported (limitation from the \code{\link[kernlab]{sigest}} function used as optimizer on the sigma parameter).
#'
#' @examples
#' list_params <- get_Model_params(model_name = "elasticnet")
#' @return a list of the default parameters for a given regression algorithm.
#'
#' @export get_Model_params
get_Model_params <- function(model_name = "elasticnet"){

  cat("\n")
  cat("Default parameters for the ", model_name, " regression model\n\n")
  if(model_name=="elasticnet"){
    params <- list("alpha" = 0.5)
  } else if(model_name=="svm"){
    params <- list("kernel" = "rbfdot",
                   "type" = "nu-svr",
                   "scaled" = T,
                   "C" = 50,
                   "cross" = 3)
  } else if(model_name=="ranger"){
    params <- list("num.trees" = 200)
  } else if(model_name=="bagging"){
    params <- list("num.trees" = 200)
  } else if(model_name=="gradientboost"){
    params <- list("params"= list("objective" = "reg:linear",
                   "eta" = 0.3,
                   "max_depth" = 6,
                   "min_child_weight" = 1,
                   "subsample" = 1,
                   "colsample_bytree" = 1),
                   "nrounds" = 10,
                   "nfold" = 5)
  } else if(model_name=="linear_Bayes"){
    params <- list("w0" = "matrix(0, ncol(X), ncol(Y))",
                   "V0_inv" = "diag(c(0,rep(1, ncol(X)-1)))",
                   "b0" = "numeric(ncol(Y))",
                   "a0" = "0")
  } else {
    params <- list("Caution" = "This model is not implemented. Please check the name, or type get_Models() for an exhaustive list of available models.")
  }

  print(params)
}


#' get the computation of parameters issued from the training of a Bayesian linear regression
#' @description Get the computation of parameters issued from the training of a Bayesian linear regression.
#' See the iqspr package paper for details concerning the definition of the cited parameters.
#' @param X is a matrix of features (e.g. fingerprints and/or physical descriptors) issued from a training set.
#' @param Y is a one-dimensional vector, or a matrix, of observables (e.g. targeted chemical properties) issued from a training
#' set.
#' @param params corresponds to the initialization of the parameters (see \code{\link{get_Model_params}} for a parameters
#' list) if known \emph{a priori} (NA by default).
#'
#' @examples
#' \dontrun{linearBayes_model <- get_linearBayes(X = X[training,], Y = Y[training,],
#' params = c(list(w0),list(V0_inv),list(a0),list(b0)))}
#' @return the latest w0, V0_inv, a0 and b0 parameters after training over the data X and Y.
#'
#' @export get_linearBayes
get_linearBayes <- function(X = NULL, Y = NULL, params = NA){
  if(length(params)==4){
    w0=params[["w0"]]
    V0_inv=params[["V0_inv"]]
    a0=params[["a0"]]
    b0=params[["b0"]]
    cat("Bayesian linear regression model is initialized\n")
  } else {
    w0 <- matrix(0, ncol(X), ncol(Y))
    V0_inv <- diag(c(0,rep(1, ncol(X)-1)))  # uninformative prior I (id matrix) with a 0 in [1,1] to avoid
    # regularization of the intercept
    b0 <- numeric(ncol(Y))
    a0 <- 0
    cat("Bayesian linear regression model sets by default\n")
  }

  cat("getting posterior distribution of parameters...\n")
  XX <- t(X)%*%X

  VnMat <- (V0_inv+XX)
  cholStatus <- try(cVnMat <- chol(VnMat), silent = TRUE)
  cholError <- ifelse(class(cholStatus) == "try-error", TRUE, FALSE)
  if(!cholError){
    TVn <- chol2inv(cVnMat) # twice faster than solve(V0_inv+XX)
  }else{
    TVn <- solve(VnMat) # return the inverse of (V0_inv+XX)
  }

  nw0 <- all(abs(w0) < .Machine$double.eps) # check if w0 is a zero matrix
  if(nw0){
    Twn <- TVn%*%(t(X)%*%Y)
  }else{
    Twn <- TVn%*%(V0_inv%*%w0+t(X)%*%Y)
  }

  TaN <- a0 + nrow(X)/2

  if(nw0){
    TbN <- b0 + diag(0.5*( (t(Y)%*%Y) - (t(Twn)%*%(V0_inv+XX)%*%Twn) ))
  }else{
    TbN <- b0 + diag(0.5*((t(w0) %*% V0_inv %*% w0) + (t(Y) %*% Y) - (t(Twn) %*% (V0_inv + XX) %*% Twn) ))
  }

  return(list(Twn,TVn,TaN,TbN))
}

#' construct a given regression model thanks to a training set
#' @description Construct a given regression model thanks to a training set.
#'
#' @param X is a matrix of features (e.g. fingerprints and/or physical descriptors) issued from a training set.
#' @param Y is a one-dimensional vector, or a matrix, of observables (e.g. targeted chemical properties) issued from a training
#' set.
#' @param model_name is the model's name to be built ("elasticnet" by default).
#' @param params is a list of fixed parameters applied to the model (see \code{\link{get_Model_params}} for a detailed view of the
#' default paramaters per model). For the "svm" model, a.k.a Support Vector Machine, Gaussian RBF or Laplace kernel
#' ("rbfdot" or "laplacedot" respectively) are supported (limitation from the \code{\link[kernlab]{sigest}} function used as optimizer on the sigma parameter).
#'
#' @examples
#' \dontrun{elasticNet_model <- get_Model(X = X[training,], Y = Y[training,],
#' model = "elasticnet", params = list("alpha" = 0.5))}
#' @return the desired regression model optimized.
#'
#' @import glmnet
#' @importFrom kernlab ksvm sigest
#' @import xgboost
#' @import ranger
#'
#' @export get_Model
get_Model <- function(X = NULL, Y = NULL, model_name = "elasticnet", params = NA){

  if(unique(is.na(params))){
    if(model_name=="elasticnet"){
      params <- list("alpha" = 0.5)
    } else if(model_name=="svm"){
      params <- list("kernel" = "rbfdot",
                     "type" = "nu-svr",
                     "scaled" = T,
                     "C" = 50,
                     "cross" = 3)
    } else if(model_name=="ranger"){
      params <- list("num.trees" = 200)
    } else if(model_name=="bagging"){
      params <- list("num.trees" = 200)
    } else if(model_name=="gradientboost"){
      params <- list("params"= list("objective" = "reg:linear",
                    "eta" = 0.3,
                    "max_depth" = 6,
                    "min_child_weight" = 1,
                    "subsample" = 1,
                    "colsample_bytree" = 1),
                    "nrounds" = 10,
                    "nfold" = 5)
    }
  } else {
    params = params
  }

  k <- numeric(0)
  if(model_name=="elasticnet"){
    # elastic-net algorithm
    rgr <- do.call(cv.glmnet, c(list(x = X, y = Y, family = "gaussian"),params))
    ##
  } else if (model_name=="svm"){
    # support vector machine algorithm
    k <- which(apply(X, 2, var) != 0)
    srange <- sigest(x = X[,k])[2]
    if(params$kernel=="rbfdot"||params$kernel=="laplacedot"){
      rgr <- do.call(ksvm, c(list(y = Y, x = X[,k], kpar = list(sigma = srange)),params))
    } else {
      rgr <- do.call(ksvm, c(list(y = Y, x = X[,k]),params))
    }
    ##
  } else if(model_name=="ranger"){
    # random forest algorithm for large amount of features and data (e.g. ranger package)
    data.train <- data.frame(X,Y,row.names = c(1:dim(X)[1]))
    colnames(data.train) <- c(colnames(X),"prop")
    rgr <- do.call(ranger, c(list(formula = prop~.,data=data.train),params))
    ##
  } else if(model_name=="bagging"){
    # random forest algorithm for large amount of features and data (e.g. ranger package)
    data.train <- data.frame(X,Y,row.names = c(1:dim(X)[1]))
    colnames(data.train) <- c(colnames(X),"prop")
    rgr <- do.call(ranger, c(list(formula = prop~.,data=data.train, mtry = ncol(X)),params))
    ##
  } else if(model_name=="gradientboost"){
    # xgboost algorithm
    k <- which(apply(X, 2, var) != 0)
    bst.cv <- do.call(xgb.cv,c(list(data = data.matrix(X[ , k]), label = Y, verbose = FALSE),params))
    nround <- which.min(bst.cv$evaluation_log$test_rmse_mean)
    rgr <- xgboost(params = params, data = data.matrix(X[ , k]), label = Y, nrounds = nround, verbose = FALSE)
    ##
  }

  return(list(rgr,params,k))
}

#' get the prediction from a Bayesian linear regression
#' @description Get the prediction from a Bayesian linear regression. For a prediction other than from a Bayesian linear
#' model, see \code{\link{use_Model}}.
#'
#' @param newx is a matrix of features (e.g. fingerprints and/or physical descriptors), issued from a test/validation set, for
#' which the properties (e.g. targeted chemical properties) are predicted.
#' @param model is the set of parameters defining a trained Bayesian linear regression model, issued from \code{\link{get_linearBayes}},
#' to be used as a predictor.
#'
#' @examples
#' \dontrun{linearBayes_pred <- use_linearBayes(newx = newx,
#' model = c(list(w0),list(V0_inv),list(a0),list(b0)))}
#'
#' @return the predicted properties with associated variance.
#'
#' @export use_linearBayes
use_linearBayes <- function(newx = NULL, model = NULL){

  Twn <- model[[1]]
  TVn <- model[[2]]
  TaN <- model[[3]]
  TbN <- model[[4]]

  predy <- t(Twn) %*% t(newx)
  temp1 <- TbN/TaN
  temp2 <- diag((1 + newx %*% TVn %*% t(newx)))
  predvar <- temp1 %*% t(temp2)

  return(list(predy,predvar))
}

#' get the prediction from a given model
#' @description Get the prediction from a desired regression model. For a Bayesian linear prediction, see \code{\link{use_linearBayes}}.
#'
#' @param newx is a matrix of features (e.g. fingerprints and/or physical descriptors), issued from a test/validation set, for
#' which the properties (e.g. targeted chemical properties) are predicted.
#' @param model_name is the model's name to be built ("elasticnet" by default).
#' @param model is the trained regression model, issued from \code{\link{get_Model}}, to be used as a predictor.
#'
#' @examples
#' \dontrun{elasticNet_pred <- use_Model(newx, model_name = "elasticnet", model = elasticnet_model)}
#'
#' @return the predicted properties.
#'
#' @import glmnet
#' @importFrom kernlab ksvm sigest
#' @import xgboost
#' @import ranger
#'
#' @importFrom stats predict
#' @export use_Model
use_Model <- function(newx = NULL, model_name = "elasticnet", model = NULL){

  params <- model[[2]]

  if(model_name=="elasticnet"){
    predy <- t(as.matrix(predict(model[[1]], s = "lambda.min", newx = as.matrix(newx))))
  } else if(model_name=="svm"){
    k <- model[[3]]
    predy <- t(as.matrix(predict(model[[1]], as.matrix(newx[,k]))))
  } else if(model_name=="ranger"){
    data.test <- data.frame(newx,row.names = c(1:dim(newx)[1]))
    colnames(data.test) <- colnames(newx)
    predy <- t(as.matrix(predict(model[[1]], dat = data.test)$predictions))
  } else if(model_name=="bagging"){
    data.test <- data.frame(newx,row.names = c(1:dim(newx)[1]))
    colnames(data.test) <- colnames(newx)
    predy <- t(as.matrix(predict(model[[1]], dat = data.test)$predictions))
  } else if(model_name=="gradientboost"){
    k <- model[[3]]
    predy <- t(as.matrix(predict(model[[1]], data.matrix(newx[,k]))))
  }

  return(predy)
}

Try the iqspr package in your browser

Any scripts or data that you put into this service are public.

iqspr documentation built on Aug. 1, 2017, 9:02 a.m.