R/regressors.R

Defines functions phenoregressor.BGLR.multikinships

Documented in phenoregressor.BGLR.multikinships

# Copyright 2017 Nelson Nazzicari
# This file is part of GROAN.
#
# GROAN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# GROAN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GROAN  If not, see <http://www.gnu.org/licenses/>.

########################################################################################
# In this file functions to do the actual regression on phenotypes after a proper      #
# training. The functions would probably be wrappers around existing                   #
# functions/libraries more than actualy implementation of mathematical models.         #
# The general form is:                                                                 #
#                                                                                      #
# phenoRegressor.algo = function(                                                      #
#           phenotypes, genotypes, extraCovariates, covariances, otherparams)          #
#                                                                                      #
# where:                                                                               #
# - algo            : name of the regressor                                            #
# - phenotypes      : phenotypes, a numeric array (n x 1), missing values are          #
#                     to be predicted                                                  #
# - genotypes       : SNP genotypes, one row per phenotype (n), one column per         #
#                     marker (m), values in 0/1/2 for diploids or 0/1/2/...ploidy for  #
#                     polyploids. Can be NULL if \code{covariances} is present.        #
# - covariances     : square matrix (n x n) of covariances. Can be NULL if genotypes   #
#                     is present.                                                      #
# - extraCovariates : extra covariates set, one row per phenotype (n), one column per  #
#                     covariate (w). If NULL no extra covariates are considered.       #
# - otherparams     : zero or more extra parameters necessary to specify the behaviour #
#                     of the regression algorithm                                      #
#                                                                                      #
# Checks on input (type, size) are not required, since these functions are supposed to #
# be used in a GROAN experiment, where checks are done when NoisyDataset and Workbench #
# are created.                                                                         #
#                                                                                      #
# The function should return a list with the following fields:                         #
# - 'predictions' : an array of (n x 1) of phenotypes with all NAs filled (predicted)  #
# - 'hyperparams' : (optional) named array of hyperparameters selected during          #
#                   training                                                           #
# - 'extradata'   : (optional) whatever extra data/objects are produced during         #
#                   training                                                           #
########################################################################################

#' Regression dummy function
#'
#' This function is for development purposes. It returns, as "predictions", an array of
#' random numbers. It accept the standard inputs and produces a formally correct output. It
#' is, obviously, quite fast.
#'
#' @param phenotypes phenotypes, numeric array (n x 1), missing values are predicted
#' @param genotypes SNP genotypes, one row per phenotype (n), one column per marker (m), values in 0/1/2 for
#'                  diploids or 0/1/2/...ploidy for polyploids. Can be NULL if \code{covariances} is present.
#' @param covariances square matrix (n x n) of covariances. Can be NULL if \code{genotypes} is present.
#' @param extraCovariates extra covariates set, one row per phenotype (n), one column per covariate (w).
#'                 If NULL no extra covariates are considered.
#' @return The function should return a list with the following fields:
#' \itemize{
#'   \item \code{predictions} : an array of (k) predicted phenotypes
#'   \item \code{hyperparams} : named array of hyperparameters selected during training
#'   \item \code{extradata}   : any extra information
#' }
#'
#' @family phenoRegressors
#'
#' @examples #genotypes are not really investigated. Only
#' #number of test phenotypes is used.
#' phenoRegressor.dummy(
#'   phenotypes = c(1:10, NA, NA, NA),
#'   genotypes = matrix(nrow = 13, ncol=30)
#' )
#' @export
phenoRegressor.dummy = function (phenotypes, genotypes, covariances, extraCovariates){
  res = list(
    predictions = runif(length(phenotypes)),
    hyperparams = c(some = 1, params='a'),
    extradata = 'filler extra data for phenoRegressor.dummy'
  )
  return(res)
}

#' Regression using BGLR package
#'
#' This is a wrapper around \code{\link[BGLR]{BGLR}}. As such, it won't work if BGLR package
#' is not installed.\cr
#' Genotypes are modeled using the specified \code{type}. If \code{type} is 'RKHS' (and only
#' in this case) the covariance/kinship matrix \code{covariances} is required, and it will be modeled
#' as matrix K in BGLR terms. In all other cases genotypes and covariances are put in the model
#' as X matrices.\cr
#' Extra covariates, if present, are modeled as FIXED effects.
#'
#' @param phenotypes phenotypes, a numeric array (n x 1), missing values are predicted
#' @param genotypes SNP genotypes, one row per phenotype (n), one column per marker (m), values in 0/1/2 for
#'                  diploids or 0/1/2/...ploidy for polyploids. Can be NULL if \code{covariances} is present.
#' @param covariances square matrix (n x n) of covariances. Can be NULL if \code{genotypes} is present.
#' @param extraCovariates extra covariates set, one row per phenotype (n), one column per covariate (w).
#'                 If NULL no extra covariates are considered.
#' @param type character literal, one of the following: FIXED (Flat prior), BRR (Gaussian prior),
#'             BL (Double-Exponential prior), BayesA (scaled-t prior),
#'             BayesB (two component mixture prior with a point of mass at zero and a scaled-t slab),
#'             BayesC (two component mixture prior with a point of mass at zero and a Gaussian slab)
#' @param ... extra parameters are passed to \code{\link[BGLR]{BGLR}}
#'
#' @return The function returns a list with the following fields:
#' \itemize{
#'   \item \code{predictions} : an array of (n) predicted phenotypes, with NAs filled and all other positions repredicted (useful for calculating residuals)
#'   \item \code{hyperparams} : empty, returned for compatibility
#'   \item \code{extradata}   : list with information on trained model, coming from \code{\link[BGLR]{BGLR}}
#' }
#' @family phenoRegressors
#' @seealso \link[BGLR]{BGLR}
#' @export
#' @examples
#' \dontrun{
#' #using the GROAN.KI dataset, we regress on the dataset and predict the first ten phenotypes
#' phenos = GROAN.KI$yield
#' phenos[1:10]  = NA
#'
#' #calling the regressor with Bayesian Lasso
#' results = phenoRegressor.BGLR(
#'   phenotypes = phenos,
#'   genotypes = GROAN.KI$SNPs,
#'   covariances = NULL,
#'   extraCovariates = NULL,
#'   type = 'BL', nIter = 2000 #BGLR-specific parameters
#' )
#'
#' #examining the predictions
#' plot(GROAN.KI$yield, results$predictions,
#'      main = 'Train set (black) and test set (red) regressions',
#'      xlab = 'Original phenotypes', ylab = 'Predicted phenotypes')
#' points(GROAN.KI$yield[1:10], results$predictions[1:10], pch=16, col='red')
#'
#' #printing correlations
#' test.set.correlation  = cor(GROAN.KI$yield[1:10], results$predictions[1:10])
#' train.set.correlation = cor(GROAN.KI$yield[-(1:10)], results$predictions[-(1:10)])
#' writeLines(paste(
#'   'test-set correlation :', test.set.correlation,
#'   '\ntrain-set correlation:', train.set.correlation
#' ))
#' }
phenoRegressor.BGLR = function (phenotypes, genotypes, covariances, extraCovariates,
                                type = c('FIXED', 'BRR', 'BL', 'BayesA', 'BayesB', 'BayesC', 'RKHS'),
                                ...){
  #is BGLR installed?
  if (!requireNamespace("BGLR", quietly = TRUE)) {
    stop("BGLR package needed for this regressor to work. Please install it.",
         call. = FALSE)
  }

  #checking regression type in a supported list
  type = match.arg(type)

  #preparing the list to call the BGLR
  BGLR.args = list(...)
  BGLR.args$y = phenotypes

  #what type of ETA depends on the user's choice of model
  #in particular, RKHS needs a kinship/covariances, while all other can work on
  #either SNPs or covariances
  if (type == 'RKHS'){
    #this will fail if covariances are not present
    BGLR.args$ETA = list(list(K=covariances, model=type))
  }else{
    #if we have snps, we add the snps
    BGLR.args$ETA = list()
    if (!is.null(genotypes)){
      BGLR.args$ETA = list(list(X=genotypes, model=type))
    }
    #if we have kinship, we add kinship
    if (!is.null(covariances)){
      num = length(BGLR.args$ETA)
      BGLR.args$ETA[[num+1]] = list(X=covariances, model=type)
    }
  }

  #if we have extra covariates, we put them in as fixed effects
  if (!is.null(extraCovariates)){
    num = length(BGLR.args$ETA)
    BGLR.args$ETA[[num+1]] = list(X=extraCovariates, model='FIXED')
  }

  #since there is no way to NOT save the progress files
  #we create a tmpdir and then delete it at the end
  tmpdir = tempdir.create('BGLR')
  BGLR.args$saveAt = tmpdir

  #we should run with verbose=FALSE, but the user takes
  #precedence
  if (is.null(BGLR.args$verbose)){
    BGLR.args$verbose = FALSE
  }

  #ready to run
  trained.model = do.call(BGLR::BGLR, args = BGLR.args)

  #delete tempdir
  unlink(tmpdir, recursive = TRUE)

  #creating the result list
  return(list(
    predictions = trained.model$yHat,
    hyperparams = c(Note = 'No hyperparameters saved from BGLR, please see extra data'),
    extradata = trained.model
  ))
}


#' Multi-matrix GBLUP using BGLR
#'
#' This regressor implements Genomic BLUP using Bayesian methods from \link[BGLR]{BGLR} package,
#' but allows to use more than one covariance matrix.
#'
#' In its simplest form, GBLUP is defined as:
#' \deqn{y = 1\mu + Z u + e}
#' with
#' \deqn{var(y) =  K \sigma_u^2 + I\sigma_e^2}
#'
#' Where \eqn{\mu} is the overall mean, \eqn{K} is the incidence matrix
#' relating individual weights \eqn{u} to \eqn{y}, and \eqn{e} is a
#' vector of residuals with zero mean and covariance matrix \eqn{I\sigma_e^2}
#'
#' It is possible to extend the above model to include different types of
#' kinship matrices, each capturing different links between genotypes and phenotypes:
#'
#' \deqn{y = 1\mu + Z1 u1 + Z2 u2 + \dots + e}
#' with
#' \deqn{var(y) =  K1 \sigma_u1^2 + K2 \sigma_u2^2 + \dots + I\sigma_e^2}
#'
#' This function receives the first kinship matrix \eqn{K1} via the \code{covariances}
#' argument and an arbitrary number of extra matrices via the \code{extraCovariates}
#' built as follow:
#'
#'  \preformatted{#given the following defined variables
#'y = <some values, Nx1 array>
#'K1 = <NxN kinship matrix>
#'K2 = <another NxN kinship matrix>
#'K3 = <a third NxN kinship matrix>
#'
#'#invoking the multi kinship GBLUP
#'y_hat = phenoregressor.BGLR.multikinships(
#'   phenotypes = y,
#'   covariances = K1,
#'   extraCovariates = cbind(K2, K3)
#')
#' }
#'
#' @param phenotypes phenotypes, a numeric array (n x 1), missing values are predicted
#' @param genotypes added for compatibility with the other GROAN regressors, must be NULL
#' @param covariances square matrix (n x n) of covariances.
#' @param extraCovariates the extra covariance matrices to be added in the GBLUP model,
#'                        collated in a single matrix-like structure, with optionally first column
#'                        as an ignored intercept (supported for compatibility). See details, below.
#' @param type character literal, one of the following: FIXED (Flat prior), BRR (Gaussian prior),
#'             BL (Double-Exponential prior), BayesA (scaled-t prior),
#'             BayesB (two component mixture prior with a point of mass at zero and a scaled-t slab),
#'             BayesC (two component mixture prior with a point of mass at zero and a Gaussian slab),
#'             RKHS (Gaussian processes, default)
#' @param ... extra parameters are passed to \code{\link[BGLR]{BGLR}}
#'
#' @return The function returns a list with the following fields:
#' \itemize{
#'   \item \code{predictions} : an array of (n) predicted phenotypes, with NAs filled and all other positions repredicted (useful for calculating residuals)
#'   \item \code{hyperparams} : empty, returned for compatibility
#'   \item \code{extradata}   : list with information on trained model, coming from \code{\link[BGLR]{BGLR}}
#' }
#' @family phenoRegressors
#' @seealso \link[BGLR]{BGLR}
#' @export
phenoregressor.BGLR.multikinships = function(phenotypes, genotypes = NULL, covariances, extraCovariates, type = 'RKHS', ...){
  #is BGLR installed?
  if (!requireNamespace("BGLR", quietly = TRUE)) {
    stop("BGLR package needed for this regressor to work. Please install it.",
         call. = FALSE)
  }

  #checking that the user knows what they are doing
  if(!is.null(genotypes)){
    stop('genotypes should be NULL, this is a kinship-only regressor')
  }

  #preparing the list to call the BGLR
  BGLR.args = list(...)
  BGLR.args$y = phenotypes

  #always using RKHS
  BGLR.args$ETA = list()
  covariances = as.matrix(covariances)
  BGLR.args$ETA[[1]] = list(K=covariances, model=type)

  #do we have extra covariates?
  if (!is.null(extraCovariates)){
    n = length(phenotypes)
    columns = ncol(extraCovariates) #this should be in the form (n x m + 1) or (n x m)
    remainder = columns %% n
    if(remainder > 1){
      stop('Extracovariates should be a matrix sized [n x nm] or [n x (nm+1)] where n is
      the number of samples and m is the number of extra kinship matrices to be
      included in the model other than the one passed via covariances parameter.
      In case of [n x (nm+1)] the first column is considered intecept and ignored.')
    }
    extraCovariates = as.matrix(extraCovariates)

    #do extracovariates have an intercept as first column?
    if (remainder == 1){
      extraCovariates = extraCovariates[,-1]
    }

    #at this point we can compute the actual number of extra matrices
    m = ncol(extraCovariates) / n

    #each block of square matrix becomes a different entry in BGLR's ETA
    for (curr in 0:(m-1)){
      from = 1 + curr * n
      to = from + n - 1
      BGLR.args$ETA[[2 + curr]] = list(K=as.matrix(extraCovariates[,from:to]), model=type)
    }
  }

  #since there is no way to NOT save the progress files
  #we create a tmpdir and then delete it at the end
  tmpdir = tempfile()
  dir.create(tmpdir, showWarnings = FALSE, recursive = TRUE)
  BGLR.args$saveAt = tmpdir

  #we should run with verbose=FALSE, but the user takes
  #precedence
  if (is.null(BGLR.args$verbose)){
    BGLR.args$verbose = FALSE
  }

  #ready to run
  trained.model = do.call(BGLR::BGLR, args = BGLR.args)

  #delete tempdir
  unlink(tmpdir, recursive = TRUE)

  #creating the result list
  return(list(
    predictions = trained.model$yHat,
    hyperparams = c(Note = 'No hyperparameters saved from BGLR, please see extra data'),
    extradata = trained.model
  ))
}


#' SNP-BLUP or G-BLUP using rrBLUP package
#'
#' This is a wrapper around \code{rrBLUP} function \code{\link[rrBLUP]{mixed.solve}}.
#' It can either work with genotypes (in form of a SNP matrix) or with kinships (in form of a covariance
#' matrix). In the first case the function will implement a SNP-BLUP, in the second a G-BLUP. An error is
#' returned if both SNPs and covariance matrix are passed.\cr
#' In rrBLUP terms, genotypes are modeled as random effects (matrix Z), covariances as matrix K, and
#' extra covariates, if present, as fixed effects (matrix X).\cr
#' Please note that this function won't work if rrBLUP package is not installed.
#'
#' @param phenotypes phenotypes, a numeric array (n x 1), missing values are predicted
#' @param genotypes SNP genotypes, one row per phenotype (n), one column per marker (m), values in 0/1/2 for
#'                  diploids or 0/1/2/...ploidy for polyploids. Can be NULL if \code{covariances} is present.
#' @param covariances square matrix (n x n) of covariances.
#' @param extraCovariates optional extra covariates set, one row per phenotype (n), one column per covariate (w).
#'                 If NULL no extra covariates are considered.
#' @param ... extra parameters are passed to rrBLUP::mixed.solve
#'
#' @return The function returns a list with the following fields:
#' \itemize{
#'   \item \code{predictions} : an array of (k) predicted phenotypes
#'   \item \code{hyperparams} : named vector with the following keys: Vu, Ve, beta, LL
#'   \item \code{extradata}   : list with information on trained model, coming from \code{\link[rrBLUP]{mixed.solve}}
#' }
#' @family phenoRegressors
#' @seealso \link[rrBLUP]{mixed.solve}
#' @export
#' @examples
#' \dontrun{
#' #using the GROAN.KI dataset, we regress on the dataset and predict the first ten phenotypes
#' phenos = GROAN.KI$yield
#' phenos[1:10]  = NA
#'
#' #calling the regressor with ridge regression BLUP on SNPs and kinship
#' results.SNP.BLUP = phenoRegressor.rrBLUP(
#'   phenotypes = phenos,
#'   genotypes = GROAN.KI$SNPs,
#'   SE = TRUE, return.Hinv = TRUE #rrBLUP-specific parameters
#' )
#' results.G.BLUP = phenoRegressor.rrBLUP(
#'   phenotypes = phenos,
#'   covariances = GROAN.KI$kinship,
#'   SE = TRUE, return.Hinv = TRUE #rrBLUP-specific parameters
#' )
#'
#' #examining the predictions
#' plot(GROAN.KI$yield, results.SNP.BLUP$predictions,
#'      main = '[SNP-BLUP] Train set (black) and test set (red) regressions',
#'      xlab = 'Original phenotypes', ylab = 'Predicted phenotypes')
#' abline(a=0, b=1)
#' points(GROAN.KI$yield[1:10], results.SNP.BLUP$predictions[1:10], pch=16, col='red')
#'
#' plot(GROAN.KI$yield, results.G.BLUP$predictions,
#'      main = '[G-BLUP] Train set (black) and test set (red) regressions',
#'      xlab = 'Original phenotypes', ylab = 'Predicted phenotypes')
#' abline(a=0, b=1)
#' points(GROAN.KI$yield[1:10], results.G.BLUP$predictions[1:10], pch=16, col='red')
#'
#' #printing correlations
#' correlations = data.frame(
#'   model = 'SNP-BLUP',
#'   test_set_correlations = cor(GROAN.KI$yield[1:10], results.SNP.BLUP$predictions[1:10]),
#'   train_set_correlations = cor(GROAN.KI$yield[-(1:10)], results.SNP.BLUP$predictions[-(1:10)])
#' )
#' correlations = rbind(correlations, data.frame(
#'   model = 'G-BLUP',
#'   test_set_correlations = cor(GROAN.KI$yield[1:10], results.G.BLUP$predictions[1:10]),
#'   train_set_correlations = cor(GROAN.KI$yield[-(1:10)], results.G.BLUP$predictions[-(1:10)])
#' ))
#' print(correlations)
#' }
phenoRegressor.rrBLUP = function (phenotypes, genotypes=NULL, covariances=NULL, extraCovariates = NULL, ...){
  #is rrBLUP installed?
  if (!requireNamespace("rrBLUP", quietly = TRUE)) {
    stop("rrBLUP package needed for this regressor to work. Please install it.",
         call. = FALSE)
  }

  #we need either genotypes or covariances (but not both!)
  if (!xor(is.null(genotypes), is.null(covariances))){
    stop("rrBLUP regressor requires either genotypes (SNPs) or covariances (usually kinship matrices), and not both!",
         call. = FALSE)
  }

  #which function should I call?
  if (is.null(genotypes)){
    return(phenoRegressor.rrBLUP.G(phenotypes = phenotypes, covariances = covariances, extraCovariates = extraCovariates, ...))
  }else{
    return(phenoRegressor.rrBLUP.SNP(phenotypes, genotypes, extraCovariates, ...))
  }
}

#' SNP-BLUP using rrBLUP library
#'
#' Implementation of SNP-BLUP using rrBLUP library. Not to be exported.
#'
#' @param phenotypes phenotypes, a numeric array (n x 1), missing values are predicted
#' @param genotypes SNP genotypes, one row per phenotype (n), one column per marker (m), values in 0/1/2 for
#'                  diploids or 0/1/2/...ploidy for polyploids. Can be NULL if \code{covariances} is present.
#' @param extraCovariates optional extra covariates set, one row per phenotype (n), one column per covariate (w).
#'                 If NULL no extra covariates are considered.
#' @param ... extra parameters are passed to rrBLUP::mixed.solve
#'
#' @keywords internal
#'
#' @return The function returns a list with the following fields:
#' \itemize{
#'   \item \code{predictions} : an array of (k) predicted phenotypes
#'   \item \code{hyperparams} : named vector with the following keys: Vu, Ve, beta, LL
#'   \item \code{extradata}   : list with information on trained model, coming from \code{\link[rrBLUP]{mixed.solve}}
#' }
phenoRegressor.rrBLUP.SNP = function (phenotypes, genotypes, extraCovariates = NULL, ...){
  #a handy selector to separate test and train sets
  test = is.na(phenotypes)

  #training the model
  m =  rrBLUP::mixed.solve(y = phenotypes, Z = genotypes, X = extraCovariates, ...)

  #applying the model to test set to obtain predictions and obtaining:
  #m$u     marker effects
  #m$beta  fixed effects

  #since fixed effects are optional, we must compute their contribution aside
  if (is.null(extraCovariates)){
    fi = m$beta
  }else{
    fi = as.matrix(extraCovariates) %*% m$beta
  }

  #predicting test set
  preds = as.vector(fi) + (as.matrix(genotypes) %*% m$u)[,1]

  #creating the result list
  return(list(
    predictions = preds,
    hyperparams = c(Vu=m$Vu, Ve=m$Ve, beta=m$beta, LL=m$LL),
    extradata = m
  ))
}

#' G-BLUP using rrBLUP library
#'
#' This function implements G-BLUP using rrBLUP library. Not to be exported.
#'
#' @param phenotypes phenotypes, a numeric array (n x 1), missing values are predicted
#' @param covariances square matrix (n x n) of covariances.
#' @param extraCovariates optional extra covariates set, one row per phenotype (n), one column per covariate (w).
#'                 If NULL no extra covariates are considered.
#' @param ... extra parameters are passed to rrBLUP::mixed.solve
#'
#' @keywords internal
#'
#' @return The function returns a list with the following fields:
#' \itemize{
#'   \item \code{predictions} : an array of (k) predicted phenotypes
#'   \item \code{hyperparams} : named vector with the following keys: Vu, Ve, beta, LL
#'   \item \code{extradata}   : list with information on trained model, coming from \code{\link[rrBLUP]{mixed.solve}}
#' }
phenoRegressor.rrBLUP.G = function (phenotypes, covariances, extraCovariates = NULL, ...){
  #extracting sample names
  samples = rownames(covariances)

  #we need to build a dataframe with all the required data
  df = data.frame(
    'phenotypes' = phenotypes,
    'samples' = samples
  )

  #should we add extraCovariates as fixed effects?
  if(!is.null(extraCovariates)){
    df = cbind(df, extraCovariates)
  }

  #we usually use GAUSS = FALSE to use passed kinship, but
  #the user takes precedence
  user.args = list(...)
  GAUSS.val = user.args$GAUSS
  if (is.null(GAUSS.val)){
    GAUSS.val = FALSE
  }

  #if covariances are not already a matrix, this is a good moment to change
  if (!is.matrix(covariances)){
    covariances = as.matrix(covariances)
  }

  #training the model
  m =  rrBLUP::kin.blup(
    data = df, geno = 'samples', pheno = 'phenotypes', fixed = colnames(extraCovariates),
    GAUSS = GAUSS.val, K = covariances)

  #as predictions, we use the predicted genetic values, averaged over the fixed effects
  preds = m$pred

  #creating the result list
  return(list(
    predictions = preds,
    hyperparams = c(Vg=m$Vg, Ve=m$Ve),
    extradata = m
  ))
}

#' Support Vector Regression using package e1071
#'
#' This is a wrapper around several functions from \code{e1071} package (as such, it won't work if
#' e1071 package is not installed).
#' This function implements Support Vector Regressions, meaning that the data points are projected in
#' a transformed higher dimensional space where linear regression is possible.\cr
#' \cr
#' \code{phenoRegressor.SVR} can operate in three modes: run, train and tune.\cr
#' In \strong{run} mode you need to pass the function an already tuned/trained SVR model, typically
#' obtained either directly from e1071 functions (e.g. from \link[e1071]{svm}, \link[e1071]{best.svm} and so forth)
#' or from a previous run of \code{phenoRegressor.SVR} in a different mode. The passed model is applied
#' to the passed dataset and predictions are returned.\cr
#' In \strong{train} mode a SVR model will be trained on the passed dataset using the passed hyper
#' parameters. The trained model will then be used for predictions.\cr
#' In \strong{tune} mode you need to pass one or more sets of hyperparameters. The best combination of
#' hyperparameters will be selected through crossvalidation. The best performing SVR model will be used
#' for final predictions. This mode can be very slow.\cr
#' \cr
#' There is no distinction between regular covariates (genotypes) and extra
#' covariates (fixed effects) in Support Vector Regression. If extra covariates are passed, they are
#' put together with genotypes, side by side. Same thing happens with covariances matrix. This
#' can bring to the scientifically questionable but technically correct situation of regressing
#' on a big matrix made of SNP genotypes, covariances and other covariates, all collated side by side.
#' The function makes no distinction, and it's up to the user understand what is correct in each
#' specific experiment.\cr
#'
#' @param phenotypes phenotypes, a numeric array (n x 1), missing values are predicted
#' @param genotypes SNP genotypes, one row per phenotype (n), one column per marker (m), values in 0/1/2 for
#'                  diploids or 0/1/2/...ploidy for polyploids. Can be NULL if \code{covariances} is present.
#' @param covariances square matrix (n x n) of covariances. Can be NULL if \code{genotypes} is present.
#' @param extraCovariates extra covariates set, one row per phenotype (n), one column per covariate (w).
#'                 If NULL no extra covariates are considered.
#' @param mode this parameter decides what will happen with the passed dataset
#'             \itemize{
#'             \item \code{mode = "tune"} : hyperparameters will be tuned on a grid (you may want to specify
#'                                 its values using extra params) with a call to \code{e1071::tune.svm}. Use this option if
#'                                 you have no idea about the optimal choice of hyperparameters. This mode can be very slow.
#'             \item \code{mode = "train"} : an SVR will be trained on the train dataset using the passed hyperparameters
#'                                  (if you know them). This more invokes \code{e1071::train}
#'             \item \code{mode = "run"} : you already have a tuned and trained SVR (put it into \code{tuned.model}) and
#'                                want to use it. The fastest mode.
#'             }
#' @param tuned.model a tuned and trained SVR to be used for prediction. This object is only used if
#'                    \code{mode} is equal to "run".
#' @param scale.pheno if TRUE (default) the phenotypes will be scaled and centered (before tuning or before
#'                    applying the passed tuned model).
#' @param scale.geno if TRUE the genotypes will be scaled and centered (before tuning or before
#'                    applying the passed tuned model. It is usually not a good idea, since it leads to
#'                    worse results. Defaults to FALSE.
#' @param ... all extra parameters are passed to \code{e1071::svm} or \code{e1071::tune.svm}
#'
#' @return The function returns a list with the following fields:
#' \itemize{
#'   \item \code{predictions} : an array of (n) predicted phenotypes
#'   \item \code{hyperparams} : named vector with the following keys: gamma, cost, coef0, nu, epsilon. Some
#'                              of the values may not make sense given the selected model, and will contain
#'                              default values from e1071 library.
#'   \item \code{extradata}   : depending on \code{mode} parameter, \code{extradata} will contain one of the
#'                              following:
#'                              1) a SVM object returned by e1071::tune.svm, containing both
#'                              the best performing model and the description of the training process
#'                              2) a newly trained SVR model
#'                              3) the same object passed as \code{tuned.model}
#' }
#' @family phenoRegressors
#' @seealso \link[e1071]{svm}, \link[e1071]{tune.svm}, \link[e1071]{best.svm} from e1071 package
#' @export
#' @examples \dontrun{
#' ### WARNING ###
#' #The 'tuning' part of the example can take quite some time to run,
#' #depending on the computational power.
#'
#' #using the GROAN.KI dataset, we regress on the dataset and predict the first ten phenotypes
#' phenos = GROAN.KI$yield
#' phenos[1:10] = NA
#'
#' #--------- TUNE ---------
#' #tuning the SVR on a grid of hyperparameters
#' results.tune = phenoRegressor.SVR(
#'   phenotypes = phenos,
#'   genotypes = GROAN.KI$SNPs,
#'   covariances = NULL,
#'   extraCovariates = NULL,
#'   mode = 'tune',
#'   kernel = 'linear', cost = 10^(-3:+3) #SVR-specific parameters
#' )
#'
#' #examining the predictions
#' plot(GROAN.KI$yield, results.tune$predictions,
#'      main = 'Mode = TUNING\nTrain set (black) and test set (red) regressions',
#'      xlab = 'Original phenotypes', ylab = 'Predicted phenotypes')
#' points(GROAN.KI$yield[1:10], results.tune$predictions[1:10], pch=16, col='red')
#'
#' #printing correlations
#' test.set.correlation  = cor(GROAN.KI$yield[1:10], results.tune$predictions[1:10])
#' train.set.correlation = cor(GROAN.KI$yield[-(1:10)], results.tune$predictions[-(1:10)])
#' writeLines(paste(
#'   'test-set correlation :', test.set.correlation,
#'   '\ntrain-set correlation:', train.set.correlation
#' ))
#'
#' #--------- TRAIN ---------
#' #training the SVR, hyperparameters are given
#' results.train = phenoRegressor.SVR(
#'   phenotypes = phenos,
#'   genotypes = GROAN.KI$SNPs,
#'   covariances = NULL,
#'   extraCovariates = NULL,
#'   mode = 'train',
#'   kernel = 'linear', cost = 0.01 #SVR-specific parameters
#' )
#'
#' #examining the predictions
#' plot(GROAN.KI$yield, results.train$predictions,
#'      main = 'Mode = TRAIN\nTrain set (black) and test set (red) regressions',
#'      xlab = 'Original phenotypes', ylab = 'Predicted phenotypes')
#' points(GROAN.KI$yield[1:10], results.train$predictions[1:10], pch=16, col='red')
#'
#' #printing correlations
#' test.set.correlation  = cor(GROAN.KI$yield[1:10], results.train$predictions[1:10])
#' train.set.correlation = cor(GROAN.KI$yield[-(1:10)], results.train$predictions[-(1:10)])
#' writeLines(paste(
#'   'test-set correlation :', test.set.correlation,
#'   '\ntrain-set correlation:', train.set.correlation
#' ))
#'
#' #--------- RUN ---------
#' #we recover the trained model from previous run, predictions will be exactly the same
#' results.run = phenoRegressor.SVR(
#'   phenotypes = phenos,
#'   genotypes = GROAN.KI$SNPs,
#'   covariances = NULL,
#'   extraCovariates = NULL,
#'   mode = 'run',
#'   tuned.model = results.train$extradata
#' )
#'
#' #examining the predictions
#' plot(GROAN.KI$yield, results.run$predictions,
#'      main = 'Mode = RUN\nTrain set (black) and test set (red) regressions',
#'      xlab = 'Original phenotypes', ylab = 'Predicted phenotypes')
#' points(GROAN.KI$yield[1:10], results.run$predictions[1:10], pch=16, col='red')
#'
#' #printing correlations
#' test.set.correlation  = cor(GROAN.KI$yield[1:10], results.run$predictions[1:10])
#' train.set.correlation = cor(GROAN.KI$yield[-(1:10)], results.run$predictions[-(1:10)])
#' writeLines(paste(
#'   'test-set correlation :', test.set.correlation,
#'   '\ntrain-set correlation:', train.set.correlation
#' ))
#' }
phenoRegressor.SVR = function (phenotypes, genotypes, covariances, extraCovariates, mode=c('tune', 'train', 'run'), tuned.model=NULL, scale.pheno=TRUE, scale.geno=FALSE, ...){
  #is e1071 installed?
  if (!requireNamespace("e1071", quietly = TRUE)) {
    stop("e1071 package needed for this regressor to work. Please install it.",
         call. = FALSE)
  }

  #ensuring the mode is ok
  mode = match.arg(mode)

  #ensuring mode coherency
  if (mode == 'run' & is.null(tuned.model)){
    stop("No tuned.model specified, but mode is 'run'",
         call. = FALSE)
  }

  #are scaling signal booleans?
  if (!is.boolean(scale.pheno)) stop("Parameter scale.pheno should be boolean", call. = FALSE)
  if (!is.boolean(scale.geno))  stop("Parameter scale.geno should be boolean", call. = FALSE)

  #if we get to this point, we can proceed

  #covariances and extra covariates are put together with genotypes
  if (is.null(genotypes)){
    genotypes = covariances
  }else{
    if (!is.null(covariances)){
      genotypes = cbind(genotypes, covariances)
    }
  }
  if (!is.null(extraCovariates)){
    genotypes = cbind(genotypes, extraCovariates)
  }

  #tuning, training or recycling?
  if (mode == 'run'){
    #the user passed an already tuned model, let's use it!
    model = tuned.model
    extra = tuned.model
  }else{
    #building a dataframe usable by formula interface
    df.train = data.frame(y_to_by_predicted = phenotypes, genotypes)

    #scaling (array of boolean, first position describes phenotypes, other
    #position describe genotypes)
    scale.signal = rep(scale.geno, ncol(df.train))
    scale.signal[1] = scale.pheno

    #tuning or training?
    if (mode == 'tune'){
      #for tuning, to avoid e1071 to generate warnings, we remove lines containing NAs
      train.set = !is.na(phenotypes)

      training.result = e1071::tune.svm(
        y_to_by_predicted ~ .,
        data=df.train[train.set,],
        scale=scale.signal,
        ...)
      #storing results in standard named vars
      model = training.result$best.model
    }else{
      training.result = e1071::svm(
        y_to_by_predicted ~ .,
        data=df.train,
        scale=scale.signal,
        ...)
      #storing results in standard named vars
      model = training.result
    }
    extra = training.result
  }

  #retrieving kernel name
  knames = c('linear', 'polynomial', 'radial', 'sigmoid')

  #extracting hyperparams (some assume default
  #value and are not used)
  hyp = c(
    'mode' = mode,
    'kernel' = knames[model$kernel + 1],
    'gamma' = model$gamma,
    'cost' = model$cost,
    'coef0' = model$coef0,
    'nu' = model$nu,
    'epsilon' = model$epsilon
  )

  #returning predictions and stuff
  return(list(
    predictions = predict(object=model, newdata=genotypes),
    hyperparams = hyp,
    extradata = extra
  ))
}

#' Random Forest Regression using package randomForest
#'
#' This is a wrapper around \link[randomForest]{randomForest} and related functions.
#' As such, this function will not work if randomForest package is not installed.
#' There is no distinction between regular covariates (genotypes) and extra
#' covariates (fixed effects) in random forest. If extra covariates are passed, they are
#' put together with genotypes, side by side. Same thing happens with covariances matrix. This
#' can bring to the scientifically questionable but technically correct situation of regressing
#' on a big matrix made of SNP genotypes, covariances and other covariates, all collated side by side.
#' The function makes no distinction, and it's up to the user understand what is correct in each
#' specific experiment.\cr
#' \cr
#' \strong{WARNING}: this function can be *very* slow, especially when called on thousands of SNPs.
#'
#' @param phenotypes phenotypes, a numeric array (n x 1), missing values are predicted
#' @param genotypes SNP genotypes, one row per phenotype (n), one column per marker (m), values in 0/1/2 for
#'                  diploids or 0/1/2/...ploidy for polyploids. Can be NULL if \code{covariances} is present.
#' @param covariances square matrix (n x n) of covariances. Can be NULL if \code{genotypes} is present.
#' @param extraCovariates extra covariates set, one row per phenotype (n), one column per covariate (w).
#'                 If NULL no extra covariates are considered.
#' @param ntree number of trees to grow, defaults to a fifth of the number of samples (rounded
#'              up). As per \code{randomForest} documentation, it should not be set to too
#'              small a number, to ensure that every input row gets predicted at least a few times
#' @param ... any extra parameter is passed to \code{randomForest::randomForest()}
#'
#' @return The function returns a list with the following fields:
#' \itemize{
#'   \item \code{predictions} : an array of (k) predicted phenotypes
#'   \item \code{hyperparams} : named vector with the following keys: ntree (number of grown trees)
#'                              and mtry (number of variables randomly sampled as candidates at each split)
#'   \item \code{extradata}   : the object returned by \code{randomForest::randomForest()}, containing the
#'                              full trained forest and the used parameters
#' }
#' @family phenoRegressors
#' @seealso \link[randomForest]{randomForest}
#' @export
#' @examples
#' \dontrun{
#' #using the GROAN.KI dataset, we regress on the dataset and predict the first ten phenotypes
#' phenos = GROAN.KI$yield
#' phenos[1:10]  = NA
#'
#' #calling the regressor with random forest
#' results = phenoRegressor.RFR(
#'   phenotypes = phenos,
#'   genotypes = GROAN.KI$SNPs,
#'   covariances = NULL,
#'   extraCovariates = NULL,
#'   ntree = 20,
#'   mtry = 200 #randomForest-specific parameters
#' )
#'
#' #examining the predictions
#' plot(GROAN.KI$yield, results$predictions,
#'      main = 'Train set (black) and test set (red) regressions',
#'      xlab = 'Original phenotypes', ylab = 'Predicted phenotypes')
#' points(GROAN.KI$yield[1:10], results$predictions[1:10], pch=16, col='red')
#'
#' #printing correlations
#' test.set.correlation  = cor(GROAN.KI$yield[1:10], results$predictions[1:10])
#' train.set.correlation = cor(GROAN.KI$yield[-(1:10)], results$predictions[-(1:10)])
#' writeLines(paste(
#'   'test-set correlation :', test.set.correlation,
#'   '\ntrain-set correlation:', train.set.correlation
#' ))
#' }
phenoRegressor.RFR = function (
  phenotypes, genotypes, covariances, extraCovariates,
  ntree=ceiling(length(phenotypes)/5), ...){
  #is randomForest installed?
  if (!requireNamespace("randomForest", quietly = TRUE)) {
    stop("randomForest package needed for this regressor to work. Please install it.",
         call. = FALSE)
  }

  #covariances and extra covariates are put together with genotypes
  if (is.null(genotypes)){
    genotypes = covariances
  }else{
    if (!is.null(covariances)){
      genotypes = cbind(genotypes, covariances)
    }
  }
  if (!is.null(extraCovariates)){
    genotypes = cbind(genotypes, extraCovariates)
  }

  #building a dataframe usable by formula interface
  df.train = data.frame(y_to_by_predicted = phenotypes, genotypes)

  #training the random forest
  RF.trained_model = randomForest::randomForest(y_to_by_predicted ~ ., data=df.train, ntree=ntree, na.action=na.omit, ...)

  #extracting hyperparams
  hyp = c(
    'ntree' = RF.trained_model$ntree,
    'mtry' = RF.trained_model$mtry
  )

  #returning predictions and stuff
  return(list(
    predictions = predict(object=RF.trained_model, newdata=genotypes),
    hyperparams = hyp,
    #in extradata we just put the trained model
    extradata = RF.trained_model
  ))
}

Try the GROAN package in your browser

Any scripts or data that you put into this service are public.

GROAN documentation built on Nov. 28, 2022, 5:07 p.m.