# R/fdaML_train.R


#' @title Train a Machine Learning model
#'
#' @description Train a machine learning model for functional (spectral) predictors, with
#' optional smoothing, PCA/PLS dimension reduction, roughness penalisation, and
#' cross-validation over the number of components and the penalty parameter.
#'
#' @param X A numeric functional predictor matrix of size \code{N*P}, where \code{N} is
#' the number of observations (spectra) and \code{P} is the number of points
#' (wavelengths) at which spectra are measured.
#'
#' @param y A numeric or factor response vector of size \code{N}, where \code{N} is the number of
#' observations (spectra).
#'
#' @param Z A numeric non-functional predictor matrix of size \code{N*S}, where \code{N}
#' is the number of observations (spectra) and \code{S} is the number of non-functional predictors
#' after binning/one-hot encoding has taken place.
#'
#' @param task Regression (\code{"regr"}) for continuous response problems or Classification
#' (\code{"clas"}) for categorical response problems.
#'
#' @param model Currently Linear Model (\code{"lm"}) for Regression or Generalised Linear Model
#' (\code{"glm"}) for Classification.
#'
#' @param reduction Partial Least Squares (\code{"pls"}), Principal Component Analysis
#' (\code{"pca"}), or no further dimension reduction (\code{"n"}).
#'
#' @param smooth_w A numeric vector of length equal to \code{length(t_range)} with weights for
#' smoothing spectra. If spectra smoothing is desired, this vector must be specified. For
#' unweighted smoothing, set all elements equal (e.g., \code{rep(1,length(t_range))}).
#'
#' @param balanced Whether the dataset should be balanced (\code{TRUE}) or not (\code{FALSE}).
#' If \code{TRUE}, observations are discarded so that the number of observations for each
#' level of the response variable is approximately the same. It applies to both Classification
#' and Regression with integer-valued response.
#'
#' @param reps Number of randomisations of the training/validating/testing subsets to average
#' over in cross-validation.
#'
#' @param Q_vec Vector of numbers of PCA/PLS components to be tried in cross-validation.
#' The default is a vector of evenly spaced values (approximately, due to rounding) between
#' \code{2} and \code{min(80, N_train-1)}, where \code{N_train} is the number of observations
#' in the training subset.
#'
#' @param Q_len Length of \code{Q_vec} when the latter is not supplied. The default is 30.
#'
#' @param Q_opt Optimal number of PCA/PLS components. If this is supplied, cross-validation for
#' the number of PCA/PLS components is bypassed.
#'
#' @param tau_Q_opt Threshold for choosing the optimal parameters.
#' If \code{tau_Q_opt=0}, then \code{Q} is the value that minimises RMSD (for Regression)
#' or maximises AUC (for Classification).
#' If \code{tau_Q_opt>0}, then \code{Q} is the smallest value which gives an RMSD/AUC within
#' a margin \code{tau_Q_opt} of the optimal RMSD/AUC.
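#' For example, if \code{tau_Q_opt=0.01} and the RMSD is minimised at \code{Q=40}, the
#' smallest \code{Q} whose RMSD is within 0.01 of that minimum is chosen instead.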
#'
#' @param lam_cv_type Cross-validation strategy to be used when choosing the penalty
#' parameter \code{lambda}: ordinary cross-validation (\code{"ocv"}), generalised
#' cross-validation (\code{"gcv"}) or no penalisation (\code{"n"}).
#'
#' @param lam_vec Vector of penalty parameters to be tried in cross-validation. The
#' default is a set of 10 values between 0.001 and 20 on an exponential scale.
#'
#' @param split_size Either a vector of length 3 specifying the proportion of observations to be
#' assigned to the training, validation and testing subsets, in this order; or a scalar
#' specifying the proportion of observations to be assigned to the training subset, in which
#' case the validation and testing subsets are assigned a proportion \code{(1-split_size)/2}
#' of observations each.
#'
#' @param intercept Whether to include a model intercept (\code{TRUE}) or not (\code{FALSE}).
#'
#' @param estimation_w A numeric vector of length equal to \code{length(y)} with weights for
#' coefficient function estimation.
#'
#' @param bspline_dim The dimension of the cubic B-spline functional representation system.
#'
#' @param t_range A numeric vector giving the wavelengths at which spectra were measured.
#'
#' @param verbose Whether to print a progress bar (\code{TRUE}) or not (\code{FALSE}).
#'
#' @param ll A list whose named elements are the parameters for this function. Provide either
#' the function parameters as usual, or this list, but not both.
#'
#' @return An object of class \code{fdaModel}, which is a list containing the trained model.
#'
#' @references
#' P.M. Esperança, Thomas S. Churcher (2019). "Machine learning based
#' epidemiological vector control monitoring using functional data analysis
#' techniques for near-infrared spectral data".
#' arXiv.
#'
#' P.T. Reiss, R.T. Ogden (2007). "Functional Principal Component Regression
#' and Functional Partial Least Squares".
#' Journal of the American Statistical Association, 102(479), 984-996.
#'
#' @seealso
#' \code{\link{fdaML_predict}}
#'
#' @examples
#'
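#' \dontrun{
#' ## A minimal, hypothetical sketch using simulated spectra (not real package data);
#' ## it assumes spectra measured at wavelengths 350:2500 and a continuous response.
#' t_range <- 350:2500
#' N <- 60
#' X <- matrix(rnorm(N * length(t_range)), N, length(t_range))
#' y <- rnorm(N)
#' fit <- fdaML_train(X = X, y = y, task = "regr", reduction = "pls",
#'                    reps = 5, Q_len = 5, bspline_dim = 100, verbose = FALSE)
#' }
#'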
#' @export
#'
fdaML_train <- function(X, y, Z=NULL, task, model=NULL, reduction, intercept=TRUE, smooth_w=NULL, balanced=FALSE, reps=100, Q_vec=NULL, Q_len=NULL, Q_opt=NULL, tau_Q_opt=0.0, lam_cv_type="n", lam_vec=NULL, split_size=c(0.5,0.25,0.25), estimation_w=NULL, bspline_dim=ncol(X), t_range=350:2500, verbose=TRUE, ll=NULL){


  # BASIC ENTRY CHECKS

  if(!is.null(ll)){
    if(nargs() > 1){
      stop("Provide the function parameters either as usual with named arguments or with a named list object 'll', but not both.")
    }
    {
      X <- ll$X
      y <- ll$y
      Z <- ll$Z
      task <- ll$task
      model <- ll$model
      reduction <- ll$reduction
      smooth_w <- ll$smooth_w
      intercept <- ll$intercept
      balanced <- ll$balanced
      lam_cv_type <- ll$lam_cv_type
      lam_vec <- ll$lam_vec
      reps <- ll$reps
      Q_len <- ll$Q_len
      Q_vec <- ll$Q_vec
      Q_opt <- ll$Q_opt
      split_size <- ll$split_size
      tau_Q_opt <- ll$tau_Q_opt
      estimation_w <- ll$estimation_w
      bspline_dim <- ll$bspline_dim
      t_range <- ll$t_range
      verbose <- ll$verbose
    }
  }

  if( missing(X) )
    stop("Input 'X' is missing with no default.")

  if( is.list(X) )
    stop("Input 'X' must be a matrix, not a list.")

  if( missing(y) )
    stop("Input 'y' is missing with no default.")

  if( is.list(y) )
    stop("Input 'y' must be a vector, not a matrix/list.")

  if( is.factor(y) ){
    if( any(table(y) <= 1) ){
      stop("Input 'y' (a factor) contains 0 or 1 observations of at least one factor level.") }}

  if( is.list(Z) )
    stop("Input 'Z' must be a numeric matrix, not a list. Categorical variables must be pre-processed using one-hot-encoding.")

  if( missing(task) ){
    stop("Input 'task' is missing with no default.")
  }else if( !(task %in% c("regr","clas")) ){
    stop("Input 'task' must be one of {'regr','clas'}.") }

  if( is.null(model) ){
    if( task == "regr" ){
      model <- "lm"
    }else if( task == "clas" ){
      model <- "glm" }
  }else{
    if( task == "regr" & !(model  %in% c("lm")) ){
      stop("Input 'model' must be one of {'lm'} for regression tasks.")
    }else if( task == "clas" & !(model  %in% c("glm")) ){
      stop("Input 'model' must be one of {'glm'} for classification tasks.") }}

  if( missing(reduction) ){
    stop("Input 'reduction' is missing with no default.")
  }else if( !(reduction %in% c("n", "pca", "pls")) ){
    stop("Input 'reduction' must be one of {'pls','pca','n'}.") }

  if( !is.logical(intercept) )
    stop("Input 'intercept' must be logical (TRUE or FALSE).")

  if( !missing(smooth_w) & !is.null(smooth_w) ){
    if( !(is.vector(smooth_w) & is.numeric(smooth_w)) )
      stop("Input 'smooth_w' must be a numeric vector.")
    if( any(smooth_w < 0) | any(smooth_w %% 1 != 0) )
      stop("All elements of input 'smooth_w' must be non-negative integers/whole numbers.")
    if( length(smooth_w) != length(t_range) )
      stop("Input 'smooth_w' must be a vector of length equal to length(t_range).")
    smooth_spectra <- TRUE
  }else{
    smooth_spectra <- FALSE }

  if( !is.logical(balanced) )
    stop("Input 'balanced' must be logical (TRUE or FALSE).")
  if( balanced && !is.factor(y) && any(y %% 1 != 0) )
    stop("Input 'balanced' cannot be TRUE when the response is neither factor nor integer-valued.")

  if( !missing(reps) ){
    if( !(is.vector(reps) & is.numeric(reps) & length(reps)==1) )
      stop("Input 'reps' must be a numeric scalar.")
    if( reps < 0 | reps %% 1 != 0 )
      stop("Input 'reps' must be a non-negative integer/whole number.") }

  if( !(lam_cv_type %in% c("ocv", "gcv", "n")) )
    stop("Input 'lam_cv_type' must be one of {'ocv','gcv','n'}.")

  if( !is.null(lam_vec) ){
    if( !(is.vector(lam_vec) & is.numeric(lam_vec)) )
      stop("All elements of input 'lam_vec' must be a numeric vector.")
    if( any(lam_vec < 0) )
      stop("All elements of input 'lam_vec' must be non-negative.") }

  # process 'split_size' unconditionally, since it has a default value
  if( !(is.vector(split_size) & is.numeric(split_size)) ){
    stop("Input 'split_size' must be a numeric vector.") }
  if( !(all(split_size >= 0.05 ) & all(split_size <= 0.95 )) ){
    stop("All elements of input 'split_size' must be between 0.05 and 0.95.") }
  if( length(split_size) == 1 ){
    train_size <- split_size
    valid_size <- test_size <- (1 - train_size)/2  # remainder split equally between validation and testing
  }else if( length(split_size) == 3 ){
    if( isTRUE(all.equal(sum(split_size), 1)) ){  # tolerate floating-point error in the sum
      train_size <- split_size[1]
      valid_size <- split_size[2]
      test_size  <- split_size[3]
    }else{
      stop("Elements of vector input 'split_size' must sum to 1.") }
  }else{
    stop("Input 'split_size' must be a vector of length 1 or 3.") }

  if( !missing(estimation_w) & !is.null(estimation_w) ){
    stop("Weighted Estimation not implemented yet.") # delete this line once weighted estimation is implemented
    if( !(is.vector(estimation_w) & is.numeric(estimation_w)) )
      stop("Input 'estimation_w' must be a numeric vector.")
    if( any(estimation_w < 0) | any(estimation_w %% 1 != 0) )
      stop("All elements of input 'estimation_w' must be non-negative integer/whole number.")
    if( length(estimation_w) != length(y) )
      stop("Input parameter 'estimation_w' must be a vector of length equal to length(y).")
    weighted_estimation <- TRUE
  }else{
    weighted_estimation <- FALSE }

  if( !missing(bspline_dim) ){
    if( !(is.vector(bspline_dim) & is.numeric(bspline_dim) & length(bspline_dim)==1) )
      stop("Input 'bspline_dim' must be a numeric scalar")
    if( bspline_dim < 1 | bspline_dim > ncol(X) )
      stop("Input 'bspline_dim' must be greater than 1 and smaller than ncol(X).")
    if( bspline_dim %% 1 != 0 )
      stop("Input 'bspline_dim' must be an integer/whole number.") }

  if( task == "clas" & lam_cv_type == "gcv" ){
    glmcv <- TRUE
  }else{
    glmcv <- FALSE }

  if( !missing(t_range) ){
    if( !(is.vector(t_range) & is.numeric(t_range)) ){
      stop("Input 't_range' must be a numeric vector.") }
    if( length(t_range) != ncol(X) ){
      stop("Input 't_range' must be a vector of length equal to ncol(X).") }
    if( !all(t_range %% 1 == 0) ){
      stop("All elements of input 't_range' must be integer/whole numbers.") }
  }


  # FORCE UNIFORM/BALANCED RESPONSE BY DISCARDING OBSERVATIONS FROM THE MORE ABUNDANT CLASSES/GROUPS
  lvls <- sort(unique(y)); nlvls <- length(lvls)
  if(balanced){
    min_abundance <- min(sapply(1:nlvls, function(z){ sum(y==lvls[z]) }))
    each_loc <- lapply(1:nlvls, function(z){ which(y==lvls[z]) })
    smplId <- unlist(lapply(1:nlvls, function(z){ sample(each_loc[[z]], min_abundance, replace=F) }))
    X <- X[smplId,]
    y <- y[smplId]
    if(!is.null(Z)){ Z <- Z[smplId,,drop=F] } }  # keep exogenous variables aligned with the retained observations


  # DEFINE TRAINING/VALIDATION/TESTING SUBSETS
  N <- nrow(X);   P <- ncol(X);   S <- ifelse(is.null(Z), 0, ncol(Z))
  splits <- dataSplit(y, train_size, valid_size, test_size, reps, balanced)
  id_train <- splits$id_train;   N_train <- length(id_train[[1]])
  id_valid <- splits$id_valid;   N_valid <- length(id_valid[[1]])
  id_test  <- splits$id_test;    N_test  <- length(id_test[[1]])
  #check: truehist(y[id_test[[1]]], nbins=10, p=F)


  # DEFINE NUMBER OF PCA/PLS COMPONENT GRID
  prioriMaxComponents <- 80
  actualMaxComponents <- min(prioriMaxComponents, N_train-1)
  if(!is.null(Q_opt)){
    if(Q_opt <= actualMaxComponents){
      Q_vec <- Q_max <- Q_opt
    }else{
      Q_vec <- Q_max <- Q_opt <- actualMaxComponents
      warning("Input 'Q_opt' is too large and was automatically reduced.")}
  }else{
    if(is.null(Q_vec)){
      if(is.null(Q_len)){
        Q_len <- 30 }
      Q_vec <- floor(seq(2, actualMaxComponents, len=Q_len))
    }else{
      if( any(Q_vec > (N_train-1)) ){  # warn only when elements are actually dropped
        Q_vec <- Q_vec[Q_vec <= (N_train-1)]
        warning(paste0("Input 'Q_vec' contains elements that are too large; these will be ignored. 'Q_vec' is now of length ",length(Q_vec),".")) }
      Q_vec <- floor(Q_vec) }}
  Q_vec <- floor(unique(Q_vec))  # rounding can cause doubling
  Q_len <- length(Q_vec)
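  # e.g., with N_train=41 and Q_len=5 the default grid is
  # Q_vec = floor(seq(2, 40, len=5)) = c(2, 11, 21, 30, 40)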


  # DEFINE REGULARISATION PARAMETER GRID
  if(lam_cv_type != "n"){
    if( is.null(lam_vec) ){
      lam_min <- 0.001;   lam_max <- 20
      lam_vec <- exp(seq(log(lam_min), log(lam_max), len=10)) }}
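  # e.g., the default grid exp(seq(log(0.001), log(20), len=10)) is approximately
  # 0.001, 0.003, 0.009, 0.027, 0.082, 0.25, 0.74, 2.2, 6.7, 20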


  # DEFINE MODEL FAMILY
  if(task == "clas"){
    if(length(unique(y)) > 2){
      family <- "multinomial"
    }else if(length(unique(y)) == 2){
      family <- "binomial"
    }else{
      stop("response 'y' must have two or more levels or unique values.") }
  }else if(task == "regr"){
    family <- "gaussian" }


  # USE SMOOTHED SPECTRA
  if(smooth_spectra){
    X_original <- X
    X <- fdaSmooth(X, wgts=smooth_w, wvls=t_range)
  }else{
    X_original <- "same as X" }  # sentinel string: smoothing not applied


  # USE FUNCTIONAL REPRESENTATION OF SPECTRA
  if(reduction == "n"){
    breaks <- seq(t_range[1], t_range[length(t_range)], len = P - 2)
  }else if(reduction == "pca" | reduction == "pls"){
    breaks <- seq(t_range[1], t_range[length(t_range)], len = bspline_dim-2) }
  B    <- bsplineS(t_range, breaks, norder=4, returnMatrix=F)            # cubic B-spline basis evaluated at the wavelengths
  B_d2 <- bsplineS(t_range, breaks, norder=4, returnMatrix=F, nderiv=2)  # 2nd derivative of the basis
  PtP <- t(B_d2) %*% B_d2            # roughness penalty matrix
  XB <- X %*% t(t(B) / colSums(B))   # project spectra onto the column-normalised basis
  # ob <- 1; plot(t_range, X[ob,], type="l"); lines(breaks, XB[ob,-c(1,ncol(XB))], col=2)
  # matplot(t(XB), type="l", col=rgb(0.6,0.6,0.6,0.8)); lines(colMeans(XB[y==1,]), col="red"); lines(colMeans(XB[y==0,]), col="blue")


  # INITIALISATIONS
  y_testpred_opt  <- matrix(0, N_test, reps)
  y_trainpred_opt <- matrix(0, N_train, reps)
  y_validpred_opt <- matrix(0, N_valid, reps)
  resid_test  <- matrix(0, N_test, reps)
  resid_train <- matrix(0, N_train, reps)
  resid_valid <- matrix(0, N_valid, reps)
  err_train <- err_valid <- err_test <- rep(NA,reps)
  perf_cv <- perf_cv_dichotomised <- lam_cv <- messed_up_lambdas <- matrix(NA, reps, Q_len)
  rownames(perf_cv) <- rownames(lam_cv) <- rownames(messed_up_lambdas) <- paste("r",1:reps,sep="")
  colnames(perf_cv) <- colnames(lam_cv) <- colnames(messed_up_lambdas) <- paste("k",1:Q_len,sep="")
  perf_cv_ALT <- array(NA, dim=c(reps, Q_len, nlvls), dimnames=list(paste0("r",1:reps), paste0("k",1:Q_len), paste0("l",lvls)))
  AUC_opt <- matrix(0, reps, 3)
  colnames(AUC_opt) <- c("train","valid","test")
  classProbs <- ROC <- ROC_opt <- ROC_to_plot_binom <- ROC_to_plot_multinom <- mOpt <- D_opt <- U_opt <- cenX <- cenY <- scaX <- scaY <- XBD_train_opt <- XBD_valid_opt <- XBD_test_opt <- penmat_opt <- list()
  if(family == "multinomial"){ for(q in 1:nlvls){ ROC_to_plot_multinom[[q]] <- list() }; names(ROC_to_plot_multinom) <- lvls }
  bias_test <- matrix(0, nrow=reps, ncol=nlvls)
  colnames(bias_test) <- paste0("age",lvls)
  linear_predictor <- matrix(0, N_test, reps)
  alpha <- rep(0, reps)
  beta <- matrix(0, ncol(XB), reps)
  gamma <- if(is.null(Z)){ NULL }else{ matrix(0, S, reps) }
  elnet <- 0  # 0=ridge, 1=lasso
  Z_pvalue <- matrix(0, reps, S)
  deviance_opt <- matrix(0, N_train, reps)


  #===============================================#
  # TRAINING and CROSS-VALIDATION for (Q,lambda)  #
  #===============================================#

  if(verbose){
    cat("","TRAINING and CROSS-VALIDATION:", sep="\n")
    pb = txtProgressBar(min = 0, max = reps, initial = 0, style = 3)
  }

  for(rr in 1:reps){


    # initialisations
    ROC[[rr]] <- list()
    wgts <- GET_weights(weighted_estimation, estimation_w, yy = y, id=id_train[[rr]], uniq = lvls)


    # data transformations
    # X (spectra)
    cenX[[rr]] <- colMeans(XB[id_train[[rr]],])
    scaX[[rr]] <- apply(XB[id_train[[rr]],], 2, sd)
    XB_train <- scale(XB[id_train[[rr]],], center = cenX[[rr]], scale=F)
    XB_valid <- scale(XB[id_valid[[rr]],], center = cenX[[rr]], scale=F)
    XB_test  <- scale(XB[id_test[[rr]],],  center = cenX[[rr]], scale=F)
    # y (response)
    if(task == "clas"){
      cenY[[rr]] <- NA
      scaY[[rr]] <- NA
      y_train  <- y[id_train[[rr]]]
      y_valid  <- y[id_valid[[rr]]]
      y_test   <- y[id_test[[rr]]]
    }else{
      cenY[[rr]] <- mean(y[id_train[[rr]]])
      scaY[[rr]] <- sd(y[id_train[[rr]]])
      y_train  <- scale(y[id_train[[rr]]],   center = cenY[[rr]], scale=F)
      y_valid  <- scale(y[id_valid[[rr]]],   center = cenY[[rr]], scale=F)
      y_test   <- scale(y[id_test[[rr]]],    center = cenY[[rr]], scale=F)
    }
    # Z (exogenous variables)
    if(!is.null(Z)){
      Z_train <- Z[id_train[[rr]],,drop=F]
      Z_valid <- Z[id_valid[[rr]],,drop=F]
      Z_test  <- Z[id_test[[rr]] ,,drop=F]
    }


    # dimension reduction
    if(reduction == "pca"){
      svdX <- svd(XB_train)
      D <- svdX$v                   #[loadings] = $rotation in prcomp()  #==#OR#==#  eigX <- eigen(t(XB_train) %*% XB_train); D <- eigX$vectors
      XBD_train <-  XB_train %*% D    #[scores] = XB_train %*% D = $x in prcomp()
    }else if(reduction == "pls"){
      plsX <- plsr(c(y_train) ~ -1 + XB_train, ncomp=min(ncol(XB_train), N_train)-1, method="simpls", validation="none")
      D <- plsX$projection      # [loadings][help(simpls_fit): "the projection matrix used to convert X to scores"]
      XBD_train <- plsX$scores  # = XB_train %*% D  [scores]
      # m1 <- plsr(c(y_train) ~ -1 + XB_train, ncomp=N_train-1, validation="LOO", method="simpls")
      # plot(RMSEP(m1), legendpos = "topright")
      # plot(m1, ncomp = 120, asp = 1, line = TRUE)
    }else if(reduction == "none"){
      stop("*---* need to code D downstream when no reduction is applied *---*")
      D <- diag(ncol(XB_train))
      XBD_train <- XB_train
      XBD_test  <- XB_test
    }


    if(reduction == "n"){
      stop("Estimation without spectra reduction (pca/pls) is possible but not implemented yet.")
    }else{
      for(qq in 1:Q_len){
        Qcomp <- ifelse(is.null(Q_opt), Q_vec[qq], Q_opt)
        D_Q <- D[,1:Qcomp,drop=F]
        XBD_trainQ <- XBD_train[,1:Qcomp]
        XBD_validQ <- XB_valid %*% D_Q
        if(!is.null(Z)){
          XBD_trainQ <- cbind(Z_train, XBD_trainQ)
          XBD_validQ <- cbind(Z_valid, XBD_validQ)
        }
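        # roughness penalty carried into the reduced space: penmat = t(D_Q) %*% PtP %*% D_Q,
        # where PtP is built from second derivatives of the B-spline basis (see above)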
        penmat <- drop(t(D_Q) %*% PtP %*% D_Q)
        optionsEst = list(intercept=intercept, S=S, task=task, model=model, fam=family, lam_cv_type=lam_cv_type, wgts=wgts, penmat=penmat, lam_vec=lam_vec, glmcv=glmcv, test=F, elnet=elnet)
        if((model == "glm") & (lam_cv_type != "n")){ optionsEst$penmat <- penmat }   #optionsEst$D <- D_Q; optionsEst$PtP <- PtP}
        lam_cv[rr,qq] <- GET_lambda(XBD_trainQ, XBD_validQ, y_train, y_valid, optionsEst)
        optionsEst$lam <- lam_cv[rr,qq]
        mQ <- fdaEstimation(XX = XBD_trainQ, yy = y_train, optionsEst = optionsEst)
        optionsPred <- list(intercept=intercept, task=task, model=model, fam=family, lam_cv_type=lam_cv_type, lam=lam_cv[rr,qq], predType="class")
        y_validpred <- fdaPrediction(m = mQ, newX = XBD_validQ, optionsPred = optionsPred)
        if(task == "clas"){
          if(family == "binomial"){
            perf_cv[rr,qq] <- GET_auc(y_pred = y_validpred, y_true = y_valid, f = family)
          }else if(family == "multinomial"){
            perf_cv[rr,qq] <- GET_auc(y_pred = y_validpred, y_true = y_valid, f = family)
            for(lvls_id in 1:nlvls){
              y_lvl <- as.numeric(y_train == lvls[lvls_id])
              m_lvl <- fdaEstimation(XX=XBD_trainQ, yy=y_lvl, optionsEst = list(model=model, fam="binomial", wgts=wgts, penmat = penmat, lam = lam_cv[rr,qq], lam_cv_type=lam_cv_type, glmcv=glmcv, test=T, elnet=elnet))
              optionsPred_multinom <- optionsPred;  optionsPred_multinom$fam <- "binomial"
              y_pred_lvl <- fdaPrediction(m = m_lvl, newX = XBD_validQ, optionsPred = optionsPred_multinom)
              pred_lvl <- prediction(y_pred_lvl, as.numeric(y_valid == lvls[lvls_id]))
              perf_lvl <- performance(pred_lvl, "auc")
              perf_cv_ALT[rr,qq,paste0("l",lvls[lvls_id])] <- unlist(slot(perf_lvl, "y.values"))
            }
          }
        }else if(task == "regr"){
          perf_cv[rr,qq] <- GET_rmsd(y_valid, y_validpred)
        }
        if(family == "multinomial" & lam_cv_type != "n"){ messed_up_lambdas[rr,qq] <- mQ$lambda }
      }#qq
    }#reduction
    if(verbose){
      setTxtProgressBar(pb,rr)
    }
  }#rr

  if((task == "clas") & (family == "multinomial")){
    for(rr in 1:reps){
      for(qq in 1:Q_len){
        perf_cv_dichotomised[rr,qq] <- mean(perf_cv_ALT[rr,qq,])
  }}}


  #===============================================#
  #        TESTING for optimal (Q,lambda)         #
  #===============================================#


  if(is.null(Q_opt)){
    Q_opt_id <- GET_IdxQ_opt(perf_cv, model, threshold=tau_Q_opt)
    Q_opt <- Q_vec[Q_opt_id]
    Q_max_id <- GET_IdxQ_opt(perf_cv, model, threshold=0)
    Q_max <- Q_vec[Q_max_id]
  }else{
    Q_opt_id <- 1
  }
  lam_opt <- mean(lam_cv[,Q_opt_id])
  PCA_variationExplained <- matrix(NA, Q_opt, reps); beta_reduced <- matrix(NA, Q_opt, reps)

  # coefficients' positions
  coef_pos <- matrix(NA,3,2); rownames(coef_pos) <- c("alpha", "gamma", "beta"); colnames(coef_pos) <- c("from", "to")
  if(intercept==T & S != 0){
    coef_pos["alpha",] <- c(1,1)
    coef_pos["gamma",] <- c(2,S+1)
    coef_pos["beta", ] <- c(S+2,1+S+Q_opt)
  }else if(intercept==T & S == 0){
    coef_pos["alpha",] <- c(1,1)
    coef_pos["gamma",] <- c(NA,NA)
    coef_pos["beta", ] <- c(2,1+Q_opt)
  }else if(intercept==F & S != 0){
    coef_pos["alpha",] <- c(NA,NA)
    coef_pos["gamma",] <- c(1,S)
    coef_pos["beta", ] <- c(S+1,S+Q_opt)
  }else if(intercept==F & S == 0){
    coef_pos["alpha",] <- c(NA,NA)
    coef_pos["gamma",] <- c(NA,NA)
    coef_pos["beta", ] <- c(1,Q_opt)
  }
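  # e.g., intercept=TRUE, S=2, Q_opt=10 gives alpha at position 1, gamma at
  # positions 2:3, and beta at positions 4:13 of the estimated coefficient vector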

  if(verbose){
    cat("","TESTING:", sep="\n")
    pb = txtProgressBar(min = 0, max = reps, initial = 0, style = 3)
  }

  for(rr in 1:reps){
    ROC_opt[[rr]] <- list()
    XB_train_opt <- scale(XB[id_train[[rr]],], center=cenX[[rr]], scale=F)
    y_train_opt <- if(task == "clas"){ y[id_train[[rr]]] }else{ scale(y[id_train[[rr]]], center=cenY[[rr]], scale=F) }
    if(reduction == "none"){
      ;
    }else if(reduction == "pca"){
      svdX_opt <- svd(XB_train_opt)
      D_opt[[rr]] <- svdX_opt$v[,1:Q_opt]
      XBD_train_opt[[rr]] <- XB_train_opt %*% D_opt[[rr]]
      PCA_variationExplained[,rr] <- svdX_opt$d[1:Q_opt] / sum(svdX_opt$d[1:Q_opt])  # proportion of each singular value among the Q_opt retained (not of the total)
    }else if(reduction == "pls"){
      plsX <- plsr(c(y_train_opt) ~ -1 + XB_train_opt, ncomp=min(ncol(XB_train_opt), N_train)-1, method="simpls", validation="none")
      D_opt[[rr]] <- plsX$projection[,1:Q_opt]
      XBD_train_opt[[rr]] <- plsX$scores[,1:Q_opt]
    }
    XBD_valid_opt[[rr]] <- scale(XB[id_valid[[rr]],], cen=cenX[[rr]],scale=F)  %*% D_opt[[rr]]
    XBD_test_opt[[rr]]  <- scale(XB[id_test[[rr]],],  cen=cenX[[rr]],scale=F)  %*% D_opt[[rr]]
    # add exogenous variables
    if(!is.null(Z)){
      XBD_train_opt[[rr]] <- cbind(Z[id_train[[rr]],,drop=F], XBD_train_opt[[rr]])
      XBD_valid_opt[[rr]] <- cbind(Z[id_valid[[rr]],,drop=F], XBD_valid_opt[[rr]])
      XBD_test_opt[[rr]]  <- cbind(Z[id_test[[rr]],, drop=F], XBD_test_opt[[rr]])
    }
    penmat_opt[[rr]] <- t(D_opt[[rr]]) %*% PtP %*% D_opt[[rr]]
    mOpt[[rr]] <- fdaEstimation(XX=XBD_train_opt[[rr]], yy=y_train_opt, optionsEst = list(intercept=intercept, S=S, lam_cv_type=lam_cv_type, task=task, model=model, fam=family, wgts=wgts, penmat = penmat_opt[[rr]], lam = lam_opt, glmcv=glmcv, test=T, elnet=elnet))  #coeffs_opt <- solve( StS + lam_cv[rr,which(Q_opt==Q_vec)] * t(D_opt) %*% PtP %*% D_opt ) %*% t(XBD_train_opt[[rr]]) %*% y_train_opt
    if(!is.null(Z)){# p-values for exogenous variables; glmnet() doesn't give std errors by default, thus using glm() here
      Z_pvalue[rr,] <- summary(glm(as.numeric(y_train_opt) ~ 1 + XBD_train_opt[[rr]]))$coefficients[coef_pos["gamma","from"]:coef_pos["gamma","to"], "Pr(>|t|)"]
    }
    optionsPred = list(intercept=intercept, lam_cv_type=lam_cv_type, task=task, model=model, fam=family, lam = lam_opt)
    y_trainpred_opt[,rr] <- fdaPrediction(m = mOpt[[rr]], newX = XBD_train_opt[[rr]], optionsPred = optionsPred)   #XBD_train_opt[[rr]] %*% coeffs_opt
    y_validpred_opt[,rr] <- fdaPrediction(m = mOpt[[rr]], newX = XBD_valid_opt[[rr]], optionsPred = optionsPred)   #scale(XB[id_valid[[rr]],], cen=cenX[[rr]],scale=F)  %*% D_opt %*% coeffs_opt
    y_testpred_opt[,rr]  <- fdaPrediction(m = mOpt[[rr]], newX = XBD_test_opt[[rr]],  optionsPred = optionsPred)   #scale(XB[id_test[[rr]],],  cen=cenX[[rr]],scale=F)  %*% D_opt %*% coeffs_opt
    if(task == "regr"){
      # regression coefficients in the space of original predictors
      if(!is.na(coef_pos["alpha",1]))
        alpha[rr] <- mOpt[[rr]]$estimate[1]
      if(!is.na(coef_pos["gamma",1]))
        stop("Gamma not coded for fLM.")
      if(!is.na(coef_pos["beta",1])){
        beta_reduced[,rr] <- mOpt[[rr]]$estimate[coef_pos["beta","from"]:coef_pos["beta","to"]]   # PCA/PLS-reduced space
        beta[,rr] <- c(D_opt[[rr]] %*%  beta_reduced[,rr])                                        # original space
      }
      linear_predictor[,rr] <- alpha[rr] + c(XBD_test_opt[[rr]]  %*% cbind(gamma[,rr], beta_reduced[,rr]))  # 'XBD_test_opt[[rr]]' already includes Z-variables
      # error
      resid_train[,rr] <- y_train_opt - y_trainpred_opt[,rr]
      err_train[rr]    <- GET_rmsd(scale(y[id_train[[rr]]],center=cenY[[rr]],scale=F), y_trainpred_opt[,rr])
      resid_valid[,rr] <- scale(y[id_valid[[rr]]], cen=cenY[[rr]],scale=F) - y_validpred_opt[,rr]
      err_valid[rr]    <- GET_rmsd(scale(y[id_valid[[rr]]],center=cenY[[rr]],scale=F), y_validpred_opt[,rr])
      resid_test[,rr]  <- scale(y[id_test[[rr]]],  cen=cenY[[rr]],scale=F) - y_testpred_opt[,rr]
      err_test[rr]     <- GET_rmsd(scale(y[id_test[[rr]]],center=cenY[[rr]],scale=F), y_testpred_opt[,rr])
      # bias
      for(uu in 1:nlvls){
        id <- y[id_test[[rr]]] == lvls[uu]
        bias_test[rr,uu] <- mean(y_testpred_opt[id,rr]+cenY[[rr]] - lvls[uu])
      }
    }else if(task == "clas"){
      if(family == "binomial"){
        # regression coefficients in the space of original predictors and linear predictor (to plot distributions and cutoff)
        if(model == "glm"){
          if(lam_cv_type == "n"){
            if(!is.na(coef_pos["alpha",1]))
              alpha[rr] <- coefficients(mOpt[[rr]])["(Intercept)",]
            if(!is.na(coef_pos["gamma",1]))
              gamma[,rr] <- as.matrix(coefficients(mOpt[[rr]])[coef_pos["gamma","from"]:coef_pos["gamma","to"]])
            if(!is.na(coef_pos["beta",1])){
              beta_reduced[,rr] <- coefficients(mOpt[[rr]])[coef_pos["beta","from"]:coef_pos["beta","to"],]  # PCA/PLS-reduced space
              beta[,rr] <- c(D_opt[[rr]] %*% beta_reduced[,rr])                                              # original space
            }
          }else{ # when using penalisation, the output from fdaEstimation() is different, so need to distinguish
            if(!is.na(coef_pos["alpha",1]))
              alpha[rr] <- mOpt[[rr]]$estimate[1]
            if(!is.na(coef_pos["gamma",1]))
              gamma[,rr] <- mOpt[[rr]]$estimate[coef_pos["gamma","from"]:coef_pos["gamma","to"]]
            if(!is.na(coef_pos["beta",1])){
              beta_reduced[,rr] <- mOpt[[rr]]$estimate[coef_pos["beta","from"]:coef_pos["beta","to"]]  # PCA/PLS-reduced space
              beta[,rr] <- c(D_opt[[rr]] %*% beta_reduced[,rr])                                        # original space
            }
          }
          linear_predictor[,rr] <- alpha[rr] + c(XBD_test_opt[[rr]]  %*% rbind(gamma[,rr,drop=F], beta_reduced[,rr,drop=F]))  # 'XBD_test_opt[[rr]]' already includes Z-variables
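          # signed binomial deviance residuals: sign(y - p) * sqrt(-2*(y*log(p) + (1-y)*log(1-p)))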
          deviance_opt[,rr] <- (-1)^as.numeric(1 - y_train_opt > y_trainpred_opt[,rr]) * sqrt(-2 * (y_train_opt * log(y_trainpred_opt[,rr]) + (1-y_train_opt) * log(1 - y_trainpred_opt[,rr])))
        }
        # auc/roc
        AUC_opt[rr,] <- c(GET_auc(y_pred = y_trainpred_opt[,rr], y_true = y[id_train[[rr]]], f = family),
                          GET_auc(y_pred = y_validpred_opt[,rr], y_true = y[id_valid[[rr]]], f = family),
                          GET_auc(y_pred = y_testpred_opt[,rr],  y_true = y[id_test[[rr]]],  f = family))
        #ROC_opt[[rr]] <- list(train = performance(prediction(y_trainpred_opt[,rr], y[id_train[[rr]]]), "tpr", "fpr"),
        #                      valid = performance(prediction(y_validpred_opt[,rr], y[id_valid[[rr]]]), "tpr", "fpr"),
        #                      test  = performance(prediction(y_testpred_opt[,rr],  y[id_test[[rr]]]),  "tpr", "fpr"))
        ROC_to_plot_binom$pred_test[[rr]] <- y_testpred_opt[,rr]
        ROC_to_plot_binom$labels[[rr]]    <- y[id_test[[rr]]]
      }else if(family == "multinomial"){
        # get class probabilities
        classProbs[[rr]] <- drop(fdaPrediction(m = mOpt[[rr]], newX = scale(XB[id_test[[rr]],],  cen=cenX[[rr]],scale=F)  %*% D_opt[[rr]], optionsPred = list(fam=family, lam_cv_type=lam_cv_type, predType="probs")))
        # regression coefficients in the space of original predictors
        ;
        # auc/roc
        AUC_opt[rr,] <- c(GET_auc(y_pred = y_trainpred_opt[,rr], y_true = y[id_train[[rr]]], f = family),
                          GET_auc(y_pred = y_validpred_opt[,rr], y_true = y[id_valid[[rr]]], f = family),
                          GET_auc(y_pred = y_testpred_opt[,rr],  y_true = y[id_test[[rr]]],  f = family))
        for(lvls_id in 1:nlvls){# see https://stats.stackexchange.com/questions/2151/how-to-plot-roc-curves-in-multiclass-classification and especially https://stats.stackexchange.com/questions/71700/how-to-draw-roc-curve-with-three-response-variable/110550#110550
          y_opt_lvl <- as.numeric(y_train_opt == lvls[lvls_id])
          mOpt_lvl <- fdaEstimation(XX=XBD_train_opt[[rr]], yy=y_opt_lvl, optionsEst = list(intercept=intercept, lam_cv_type=lam_cv_type, model=model, fam="binomial", wgts=wgts, penmat = penmat_opt[[rr]], lam = lam_opt, glmcv=glmcv, test=T, elnet=elnet))  #coeffs_opt <- solve( StS + lam_cv[rr,which(Q_opt==Q_vec)] * t(D_opt) %*% PtP %*% D_opt ) %*% t(XBD_train_opt[[rr]]) %*% y_train_opt
          y_testpred_opt_lvl <- fdaPrediction(m = mOpt_lvl, newX = scale(XB[id_test[[rr]],], cen=cenX[[rr]],scale=F) %*% D_opt[[rr]], optionsPred = list(fam="binomial", lam_cv_type=lam_cv_type))   #scale(XB[id_test[[rr]],],  cen=cenX[[rr]],scale=F)  %*% D_opt %*% coeffs_opt
          ROC_to_plot_multinom[[as.character(lvls[lvls_id])]]$pred_test[[rr]] <- y_testpred_opt_lvl
          ROC_to_plot_multinom[[as.character(lvls[lvls_id])]]$labels[[rr]]    <- as.numeric(y[id_test[[rr]]] == lvls[lvls_id])
        }
      }#family
    }#task
    if(verbose){
      setTxtProgressBar(pb,rr)
    }
  }#rr
  if(verbose){
    cat("","", sep="\n")
  }


  # OUTPUT
  names(y_testpred_opt) <- names(y_validpred_opt) <- names(y_trainpred_opt) <- NULL;

  ret <- structure( list(perf_cv = perf_cv,   # AUC for 'glm', RMSD for 'lm'
                         perf_cv_dichotomised = perf_cv_dichotomised,
                         lam_vec = lam_vec,
                         lam_cv = lam_cv,
                         lam_opt = lam_opt,
                         m_opt = mOpt,
                         Q_len = Q_len,
                         Q_vec = Q_vec,
                         Q_opt = Q_opt,
                         Q_max = Q_max,
                         classProbs = classProbs,
                         AUC_opt = AUC_opt,
                         ROC_opt = ROC_opt,
                         ROC_to_plot_binom = ROC_to_plot_binom,
                         ROC_to_plot_multinom = ROC_to_plot_multinom,
                         PCA_variationExplained = PCA_variationExplained,
                         alpha = alpha,
                         gamma = gamma,
                         beta_reduced = beta_reduced,
                         beta = beta,
                         beta_roughness = apply(beta, 2, GET_roughness, std=F),
                         beta_roughnessStd = apply(beta, 2, GET_roughness, std=T),
                         beta_avg_roughness = GET_roughness(rowMeans(beta), std=F),
                         beta_avg_roughnessStd = GET_roughness(rowMeans(beta), std=T),
                         coef_pos = coef_pos,
                         id_train = id_train,
                         id_valid = id_valid,
                         id_test = id_test,
                         N_train = N_train,
                         N_valid = N_valid,
                         N_test = N_test,
                         X_original = X_original,
                         X = X,
                         y = y,
                         Z = Z,
                         S = S,
                         Z_pvalue = Z_pvalue,
                         deviance_opt = deviance_opt,
                         breaks = breaks,
                         B = B,
                         B_d2 = B_d2,
                         XB = XB,
                         D_opt = D_opt,
                         penmat_opt = penmat_opt,
                         XBD_train_opt = XBD_train_opt,
                         linear_predictor = linear_predictor,
                         cenX = cenX,
                         cenY = cenY,
                         reps = reps,
                         smooth_spectra = smooth_spectra,
                         smooth_w = smooth_w,
                         weighted_estimation = weighted_estimation,
                         estimation_w = estimation_w,
                         y_trainpred_opt = y_trainpred_opt,
                         y_validpred_opt = y_validpred_opt,
                         y_testpred_opt  = y_testpred_opt,
                         err_train = err_train,
                         err_valid = err_valid,
                         err_test  = err_test,
                         bias_test = bias_test,
                         resid_train = resid_train,
                         resid_valid = resid_valid,
                         resid_test  = resid_test,
                         lam_cv_type = lam_cv_type,
                         task = task,
                         model = model,
                         family = family,
                         intercept = intercept,
                         t_range = t_range,
                         t_range_mod = seq(t_range[1], t_range[length(t_range)], len = bspline_dim), # t_range_mod may differ from t_range, depending on the number of breaks used to create the B-spline basis
                         messed_up_lambdas = messed_up_lambdas
                         ), class="fdaModel")

  return(ret)
}