# R/sgpls_bootYT.R

#' @title Bootstrap (Y,T) function for plsRglm
#'
#' @description A function passed to \code{boot::boot} to perform the (Y,T) bootstrap.
#' 
#' 
#' @param dataRepYtt Dataset with the response in the first column and the tt 
#' components to resample in the remaining columns
#' @param ind indices for resampling
#' @param nt number of components to use
#' @param modele type of model to use, see \link[plsRglm]{plsRglm}. Not used, 
#' please specify the family instead.
#' @param family glm family to use, see \link[plsRglm]{plsRglm}
#' @param maxcoefvalues maximum absolute values allowed for the estimates of the
#' coefficients, used to discard those coming from singular bootstrap samples
#' @param ifbootfail value to return if the estimation fails on a bootstrap
#' sample
#' @return Numeric vector of the coefficients of the components estimated on a 
#' bootstrap resample, or the \code{ifbootfail} value if the bootstrap 
#' computation fails.
#' @export coefs.sgpls.CSim
#'
#' @author Jérémy Magnanensi, Frédéric Bertrand\cr
#' \email{frederic.bertrand@@utt.fr}\cr
#' \url{https://fbertran.github.io/homepage/}
#' 
#' @references A new bootstrap-based stopping criterion in PLS component construction,
#' J. Magnanensi, M. Maumy-Bertrand, N. Meyer and F. Bertrand (2016), in The Multiple Facets of Partial Least Squares and Related Methods, 
#' \doi{10.1007/978-3-319-40643-5_18}\cr
#' 
#' A new universal resample-stable bootstrap-based stopping criterion for PLS component construction,
#' J. Magnanensi, F. Bertrand, M. Maumy-Bertrand and N. Meyer, (2017), Statistics and Computing, 27, 757–774. 
#' \doi{10.1007/s11222-016-9651-4}\cr
#' 
#' New developments in Sparse PLS regression, J. Magnanensi, M. Maumy-Bertrand, 
#' N. Meyer and F. Bertrand, (2021), Frontiers in Applied Mathematics and Statistics, 
#' \doi{10.3389/fams.2021.693126}.
#'
#' @examples
#' set.seed(4619)
#' xran=cbind(rbinom(30,1,.2),matrix(rnorm(150),30,5))
#' coefs.sgpls.CSim(xran, ind=sample(1:nrow(xran)), nt=3, 
#' maxcoefvalues=1e5, ifbootfail=rep(NA,3))
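#' \donttest{
#' # A minimal usage sketch, assuming a single fake component column: the
#' # function is designed to be the statistic passed to boot::boot on a
#' # cbind(y, components) matrix, as nbcomp.bootsgpls does internally.
#' databoot <- cbind(xran[, 1], xran[, 2])
#' bootres <- boot::boot(databoot, statistic = coefs.sgpls.CSim, R = 250,
#'                       sim = "ordinary", stype = "i", nt = 1,
#'                       maxcoefvalues = 1e5, ifbootfail = NA)
#' plsRglm::confints.bootpls(bootres, typeBCa = FALSE)
#' }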
#' 
coefs.sgpls.CSim<-function (dataRepYtt, ind, nt, modele, family = binomial, maxcoefvalues,
                            ifbootfail)
{
  dataRepYb = dataRepYtt[ind, 1]
  Tb = dataRepYtt[ind, -1]
  
  tempCb = try(glm(dataRepYb ~ Tb, family = family),
               silent = TRUE)
  tempcoefs <- try(tempCb$coefficients[-1], silent = TRUE)
  Cond <- FALSE
  try(Cond <- is.numeric(tempcoefs) & all(abs(tempcoefs) <
                                            maxcoefvalues), silent = TRUE)
  if (Cond) {
    return(tempcoefs[nt])
  }
  else {
    return(ifbootfail)
  }
}

#' @title Permutation Bootstrap (Y,T) function for plsRglm
#'
#' @param dataRepYtt Dataset with the response in the first column and the tt 
#' components to resample in the remaining columns
#' @param ind indices for resampling
#' @param nt number of components to use
#' @param modele type of model to use, see \link[plsRglm]{plsRglm}. Not used, 
#' please specify the family instead.
#' @param family glm family to use, see \link[plsRglm]{plsRglm}
#' @param maxcoefvalues maximum absolute values allowed for the estimates of the
#' coefficients, used to discard those coming from singular bootstrap samples
#' @param ifbootfail value to return if the estimation fails on a bootstrap
#' sample
#' @return Numeric vector of the coefficients of the components estimated on a 
#' permutation resample, or the \code{ifbootfail} value if the 
#' computation fails.
#' @export
#'
#' @author Jérémy Magnanensi, Frédéric Bertrand\cr
#' \email{frederic.bertrand@@utt.fr}\cr
#' \url{https://fbertran.github.io/homepage/}
#' 
#' @references A new bootstrap-based stopping criterion in PLS component construction,
#' J. Magnanensi, M. Maumy-Bertrand, N. Meyer and F. Bertrand (2016), in The Multiple Facets of Partial Least Squares and Related Methods, 
#' \doi{10.1007/978-3-319-40643-5_18}\cr
#' 
#' A new universal resample-stable bootstrap-based stopping criterion for PLS component construction,
#' J. Magnanensi, F. Bertrand, M. Maumy-Bertrand and N. Meyer, (2017), Statistics and Computing, 27, 757–774. 
#' \doi{10.1007/s11222-016-9651-4}\cr
#' 
#' New developments in Sparse PLS regression, J. Magnanensi, M. Maumy-Bertrand, 
#' N. Meyer and F. Bertrand, (2021), Frontiers in Applied Mathematics and Statistics, 
#' \doi{10.3389/fams.2021.693126}.
#'
#' @examples
#' set.seed(4619)
#' xran=cbind(rbinom(30,1,.2),matrix(rnorm(150),30,5))
#' permcoefs.sgpls.CSim(xran, ind=sample(1:nrow(xran)), nt=3, 
#' maxcoefvalues=1e5, ifbootfail=rep(NA,3))
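#' \donttest{
#' # A minimal usage sketch, assuming a single fake component column: the
#' # function is designed to be passed to boot::boot with sim = "permutation",
#' # so that the components are permuted while the response keeps its original
#' # order.
#' databoot <- cbind(xran[, 1], xran[, 2])
#' permres <- boot::boot(databoot, statistic = permcoefs.sgpls.CSim, R = 250,
#'                       sim = "permutation", stype = "i", nt = 1,
#'                       maxcoefvalues = 1e5, ifbootfail = NA)
#' permres
#' }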
#' 
permcoefs.sgpls.CSim<-function (dataRepYtt, ind, nt, modele, family = binomial, maxcoefvalues,
                            ifbootfail)
{
  dataRepYb = dataRepYtt[, 1]
  Tb = dataRepYtt[ind, -1]
  
  tempCb = try(glm(dataRepYb ~ Tb, family = family),
               silent = TRUE)
  tempcoefs <- try(tempCb$coefficients[-1], silent = TRUE)
  Cond <- FALSE
  try(Cond <- is.numeric(tempcoefs) & all(abs(tempcoefs) <
                                            maxcoefvalues), silent = TRUE)
  if (Cond) {
    return(tempcoefs[nt])
  }
  else {
    return(ifbootfail)
  }
}


#' @title Number of components for SGPLS using (Y,T) bootstrap
#'
#' @param x Matrix of predictors.
#' @param y Vector or matrix of responses.
#' @param fold Number of folds for cross-validation.
#' @param eta Thresholding parameter. \code{eta} should be between 0 and 1.
#' @param R Number of resamplings.
#' @param scale.x Scale predictors by dividing each predictor variable by 
#' its sample standard deviation?
#' @param maxnt Maximum number of components allowed in an sgpls model.
#' @param br Apply Firth's bias reduction procedure?
#' @param ftype Type of Firth's bias reduction procedure. Alternatives are 
#' "iden" (the approximated version) or "hat" (the original version). 
#' Default is "iden".
#' @param plot.it Plot the results.
#' @param typeBCa Include computation for BCa type interval.
#' @param stabvalue A value to hard threshold bootstrap estimates computed from
#' atypical resamplings.
#' @param verbose Display additional information on the progress of the algorithm.
#'
#' @return A list of four elements: the cross-validation error matrix 
#' (\code{err.mat}), the optimal thresholding parameter (\code{eta.opt}), the 
#' optimal number of components (\code{K.opt}) and the matrix of candidate 
#' results (\code{cands}).
#' @export
#'
#' @author Jérémy Magnanensi, Frédéric Bertrand\cr
#' \email{frederic.bertrand@@utt.fr}\cr
#' \url{https://fbertran.github.io/homepage/}
#' 
#' @references A new bootstrap-based stopping criterion in PLS component construction,
#' J. Magnanensi, M. Maumy-Bertrand, N. Meyer and F. Bertrand (2016), in The Multiple Facets of Partial Least Squares and Related Methods, 
#' \doi{10.1007/978-3-319-40643-5_18}\cr
#' 
#' A new universal resample-stable bootstrap-based stopping criterion for PLS component construction,
#' J. Magnanensi, F. Bertrand, M. Maumy-Bertrand and N. Meyer, (2017), Statistics and Computing, 27, 757–774. 
#' \doi{10.1007/s11222-016-9651-4}\cr
#' 
#' New developments in Sparse PLS regression, J. Magnanensi, M. Maumy-Bertrand, 
#' N. Meyer and F. Bertrand, (2021), Frontiers in Applied Mathematics and Statistics, 
#' \doi{10.3389/fams.2021.693126}.
#' 
#' @examples
#' set.seed(4619)
#' data(prostate, package="spls")
#' nbcomp.bootsgpls((prostate$x)[,1:30], prostate$y, R=250, eta=0.2, maxnt=1, typeBCa = FALSE)
#' \donttest{
#' set.seed(4619)
#' data(prostate, package="spls")
#' nbcomp.bootsgpls(prostate$x, prostate$y, R=250, eta=c(0.2,0.6), typeBCa = FALSE)
#' }
nbcomp.bootsgpls=function (x, y, fold = 10, eta, R, scale.x = TRUE, maxnt=10, plot.it = TRUE, 
                           br = TRUE, ftype = "iden", typeBCa=TRUE, stabvalue = 1e+06, verbose=TRUE) 
{
  ifbootfail <- as.matrix(as.numeric(ifelse(any(class(y) == "factor"), rep(NA, ncol(x) + nlevels(y)), rep(NA, ncol(x)+1))))
  result.mat <- c()
  foldi <- cv.split(y, fold)
  x <- as.matrix(x)
  n <- nrow(x)
  p <- ncol(x)
  ip <- c(1:p)
  y <- as.matrix(y)
  q <- ncol(y)
  K.opti=rep(0,length(eta))
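  # Column index, in the confints.bootpls output, of the lower bound used for
  # the stopping test: the BCa interval when typeBCa is TRUE, the percentile
  # interval otherwise.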
  ncolBoot <- 5+2*typeBCa
  for (lll in 1:length(eta)) {
    if(verbose){print(paste("eta =", eta[lll]))}
    
    ### Finding the number of components
    inter=0
    indK=1
    resK=sgpls.T(x, y, K = indK, eta = eta[lll], scale.x = scale.x, eps = 1e-05, denom.eps = 1e-20, 
                 zero.eps = 1e-05, maxstep = 100, br = br, ftype = ftype)
    if(verbose){print(paste("K =", indK))}
    maxcoefvalues <- stabvalue * abs(resK$CoeffC)
    compTsim=resK$tt
    databoot=cbind(y,compTsim)
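    # Bootstrap the GLM of the response on the PLS components; the statistic
    # returns the coefficient of the last (indK-th) component.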
    bootcomp<-boot::boot(databoot, statistic=coefs.sgpls.CSim, nt=indK, maxcoefvalues = maxcoefvalues, ifbootfail = ifbootfail, sim="ordinary", stype="i", R=R)
    confYT=t(as.matrix(plsRglm::confints.bootpls(bootcomp, typeBCa = typeBCa)))
    
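    # Add components while the lower bound of the bootstrap confidence interval
    # of the last component's coefficient stays above 0, up to maxnt components.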
    while ((confYT[1,ncolBoot]>0) & (indK<maxnt+1)) {
      indK=indK+1             
      resK<-sgpls.T(x, y, K = indK, eta = eta[lll], scale.x = scale.x, eps = 1e-05, denom.eps = 1e-20, 
                    zero.eps = 1e-05, maxstep = 100, br = br, ftype = ftype)
      if(verbose){print(paste("K =", indK))}
      maxcoefvalues <- stabvalue * abs(resK$CoeffC)
      ind=1
      compTsim=resK$tt[,ind]
      databoot=cbind(y,compTsim)
      bootcomp<-boot::boot(databoot, statistic=coefs.sgpls.CSim, nt=ind, maxcoefvalues = maxcoefvalues[ind], ifbootfail = ifbootfail, sim="ordinary", stype="i", R=R)
      confC=t(as.matrix(plsRglm::confints.bootpls(bootcomp, typeBCa = typeBCa)))
      while ((confC[1,ncolBoot]>0) & (ind<indK)){
        ind=ind+1
        compTsim=resK$tt[,1:ind]
        databoot=cbind(y,compTsim)
        bootcomp<-boot::boot(databoot, statistic=coefs.sgpls.CSim, nt=ind, maxcoefvalues = maxcoefvalues[1:ind], ifbootfail = ifbootfail, sim="ordinary", stype="i", R=R)
        confC=t(as.matrix(plsRglm::confints.bootpls(bootcomp, typeBCa = typeBCa)))
      }
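      # If the scan stopped before reaching the newest component, one of the
      # coefficients is not significant: zero out confYT so the outer loop stops.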
      if (ind!=indK){
        confYT=matrix(0, 1, ncolBoot+1)
      }else{
        confYT=confC
      }
    }
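    # The newest component failed the test (or maxnt was reached), so keep
    # indK - 1 components for this value of eta.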
    K.opti[lll]=indK - 1
  }
  
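  # Cross-validated misclassification rate of an sgpls fit for a given
  # (eta, K) pair, using the folds drawn by cv.split().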
  .fit.sgpls.binary <- function(eta.val, K.val) {
    mspemati <- rep(0, fold)
    Ai <- rep(0, fold)
    for (k in 1:fold) {
      omit <- foldi[[k]]
      train.x <- x[-omit, ]
      train.y <- y[-omit, ]
      test.x <- x[omit, ]
      test.y <- y[omit, ]
      sgpls.fit <- spls::sgpls(train.x, train.y, K = K.val, eta = eta.val, 
                               scale.x = scale.x, br = br, ftype = ftype)
      pred <- as.numeric(as.vector(predict(sgpls.fit, newx = test.x)))
      mspemati[k] <- mean(as.numeric(pred != test.y))
      Ai[k] <- mean(length(sgpls.fit$A))
    }
    mspe.ij <- c(mean(mspemati), mean(Ai), eta.val, K.val)
    return(mspe.ij)
  }
  result.mat=matrix(0,length(eta),4)
  for (i in 1:length(eta)){
    result.mat[i,] <- .fit.sgpls.binary(eta.val = eta[i],K.val = K.opti[i]) 
  }                                                                
  mspemat <- matrix(result.mat[, 1], length(eta),1)
  rownames(mspemat) <- paste("eta=",eta,"; K=",K.opti)
  colnames(mspemat) <- ""
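  # Tie-breaking among candidate (eta, K) pairs: minimal CV error, then
  # smallest mean active set, then smallest K, then largest eta.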
  cands <- result.mat[result.mat[, 1] == min(result.mat[, 1]), 
                      , drop = FALSE]
  cands <- cands[cands[, 2] == min(cands[, 2]), , drop = FALSE]
  cands <- cands[cands[, 4] == min(cands[, 4]), , drop = FALSE]
  cands <- cands[cands[, 3] == max(cands[, 3]), , drop = FALSE]
  K.opt <- cands[, 4]
  eta.opt <- cands[, 3]
  if(verbose){cat(paste("\nOptimal parameters: eta = ", eta.opt, ", ", 
                        sep = ""))}
  if(verbose){cat(paste("K = ", K.opt, "\n", sep = ""))}
  if (plot.it) {
    oldpar <- par(no.readonly = TRUE)
    on.exit(par(oldpar))
    par(mar=c(5,7,4,2))
    spls::heatmap.spls(t(mspemat), main = "CV error Plot", 
                       coln = 16, as = "n")
  }
  rownames(mspemat) <- paste("eta=",eta,"; K=",K.opti)
  colnames(mspemat) <- ""
  cv <- list(err.mat = mspemat, eta.opt = eta.opt, K.opt = K.opt, cands=cands)
  return(cv)
}


#' @title Number of components for SGPLS using (Y,T) bootstrap 
#' (parallel version)
#'
#' @param x Matrix of predictors.
#' @param y Vector or matrix of responses.
#' @param fold Number of folds for cross-validation.
#' @param eta Thresholding parameter. \code{eta} should be between 0 and 1.
#' @param R Number of resamplings.
#' @param scale.x Scale predictors by dividing each predictor variable by 
#' its sample standard deviation?
#' @param maxnt Maximum number of components allowed in an sgpls model.
#' @param br Apply Firth's bias reduction procedure?
#' @param ftype Type of Firth's bias reduction procedure. Alternatives are 
#' "iden" (the approximated version) or "hat" (the original version). 
#' Default is "iden".
#' @param ncpus Number of CPUs for parallel computing.
#' @param plot.it Plot the results.
#' @param typeBCa Include computation for BCa type interval.
#' @param stabvalue A value to hard threshold bootstrap estimates computed from
#' atypical resamplings.
#' @param verbose Display additional information on the progress of the algorithm.
#'
#' @return A list of four elements: the cross-validation error matrix 
#' (\code{err.mat}), the optimal thresholding parameter (\code{eta.opt}), the 
#' optimal number of components (\code{K.opt}) and the matrix of candidate 
#' results (\code{cands}).
#' @export
#'
#' @author Jérémy Magnanensi, Frédéric Bertrand\cr
#' \email{frederic.bertrand@@utt.fr}\cr
#' \url{https://fbertran.github.io/homepage/}
#' 
#' @references A new bootstrap-based stopping criterion in PLS component construction,
#' J. Magnanensi, M. Maumy-Bertrand, N. Meyer and F. Bertrand (2016), in The Multiple Facets of Partial Least Squares and Related Methods, 
#' \doi{10.1007/978-3-319-40643-5_18}\cr
#' 
#' A new universal resample-stable bootstrap-based stopping criterion for PLS component construction,
#' J. Magnanensi, F. Bertrand, M. Maumy-Bertrand and N. Meyer, (2017), Statistics and Computing, 27, 757–774. 
#' \doi{10.1007/s11222-016-9651-4}\cr
#' 
#' New developments in Sparse PLS regression, J. Magnanensi, M. Maumy-Bertrand, 
#' N. Meyer and F. Bertrand, (2021), Frontiers in Applied Mathematics and Statistics, 
#' \doi{10.3389/fams.2021.693126}.
#' 
#' @examples
#' set.seed(4619)
#' data(prostate, package="spls")
#' nbcomp.bootsgpls.para((prostate$x)[,1:30], prostate$y, R=250, eta=0.2, maxnt=1, typeBCa = FALSE)
#' \donttest{
#' set.seed(4619)
#' data(prostate, package="spls")
#' nbcomp.bootsgpls.para(prostate$x, prostate$y, R=250, eta=c(0.2,0.6), typeBCa = FALSE)
#' }
nbcomp.bootsgpls.para=function (x, y, fold = 10, eta, R, scale.x = TRUE, maxnt=10, 
                                br = TRUE, ftype = "iden", ncpus = 1, plot.it = TRUE, 
                                typeBCa=TRUE, stabvalue = 1e+06, verbose=TRUE) 
{
  lll<-NA
  ifbootfail <- as.matrix(as.numeric(ifelse(any(class(y) == "factor"), rep(NA, ncol(x) + nlevels(y)), rep(NA, ncol(x)+1))))
  result.mat <- c()
  foldi <- cv.split(y, fold)
  x <- as.matrix(x)
  n <- nrow(x)
  p <- ncol(x)
  ip <- c(1:p)
  y <- as.matrix(y)
  q <- ncol(y)
  K.opti=rep(0,length(eta))
  ncolBoot <- 5+2*typeBCa
  
  doParallel::registerDoParallel(cores=ncpus)
  requireNamespace("foreach",quietly = TRUE)
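  # Run the component-selection loop for each value of eta in parallel; each
  # foreach iteration returns the selected number of components (indK - 1).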
  par.K.opti=foreach::foreach(lll=1:length(eta), .combine="cbind", 
                              .export=c("eta", "ftype", "br", "scale.x", 
                                        "R", "maxnt", "stabvalue")) %dopar% {
                                          if(verbose){print(paste("eta =", eta[lll]))}
                                          
                                          ### Finding the number of components
                                          inter=0
                                          indK=1
                                          resK=sgpls.T(x, y, K = indK, eta = eta[lll], scale.x = scale.x, eps = 1e-05, denom.eps = 1e-20, 
                                                       zero.eps = 1e-05, maxstep = 100, br = br, ftype = ftype)
                                          if(verbose){print(paste("K =", indK))}
                                          maxcoefvalues <- stabvalue * abs(resK$CoeffC)
                                          compTsim=resK$tt
                                          databoot=cbind(y,compTsim)
                                          bootcomp<-boot::boot(databoot, statistic=coefs.sgpls.CSim, nt=indK, maxcoefvalues = maxcoefvalues, ifbootfail = ifbootfail, sim="ordinary", stype="i", R=R)
                                          confYT=t(as.matrix(plsRglm::confints.bootpls(bootcomp, typeBCa = typeBCa)))
                                          
                                          while ((confYT[1,ncolBoot]>0) & (indK<maxnt+1)) {
                                            indK=indK+1             
                                            resK<-sgpls.T(x, y, K = indK, eta = eta[lll], scale.x = scale.x, eps = 1e-05, denom.eps = 1e-20, 
                                                          zero.eps = 1e-05, maxstep = 100, br = br, ftype = ftype)
                                            if(verbose){print(paste("K =", indK))}
                                            maxcoefvalues <- stabvalue * abs(resK$CoeffC)
                                            ind=1
                                            compTsim=resK$tt[,ind]
                                            databoot=cbind(y,compTsim)
                                            bootcomp<-boot::boot(databoot, statistic=coefs.sgpls.CSim, nt=ind, maxcoefvalues = maxcoefvalues[ind], ifbootfail = ifbootfail, sim="ordinary", stype="i", R=R)
                                            confC=t(as.matrix(plsRglm::confints.bootpls(bootcomp, typeBCa = typeBCa)))
                                            while ((confC[1,ncolBoot]>0) & (ind<indK)){
                                              ind=ind+1
                                              compTsim=resK$tt[,1:ind]
                                              databoot=cbind(y,compTsim)
                                              bootcomp<-boot::boot(databoot, statistic=coefs.sgpls.CSim, nt=ind, maxcoefvalues = maxcoefvalues[1:ind], ifbootfail = ifbootfail, sim="ordinary", stype="i", R=R)
                                              confC=t(as.matrix(plsRglm::confints.bootpls(bootcomp, typeBCa = typeBCa)))
                                            }
                                            if (ind!=indK){
                                              confYT=matrix(0, 1, ncolBoot+1)
                                            }else{
                                              confYT=confC
                                            }
                                          }
                                          K.opti[lll]=indK - 1
                                        }
  doParallel::stopImplicitCluster()
  
  .fit.sgpls.binary <- function(eta.val, K.val) {
    mspemati <- rep(0, fold)
    Ai <- rep(0, fold)
    for (k in 1:fold) {
      omit <- foldi[[k]]
      train.x <- x[-omit, ]
      train.y <- y[-omit, ]
      test.x <- x[omit, ]
      test.y <- y[omit, ]
      sgpls.fit <- spls::sgpls(train.x, train.y, K = K.val, eta = eta.val, 
                               scale.x = scale.x, br = br, ftype = ftype)
      pred <- as.numeric(as.vector(predict(sgpls.fit, newx = test.x)))
      mspemati[k] <- mean(as.numeric(pred != test.y))
      Ai[k] <- mean(length(sgpls.fit$A))
    }
    mspe.ij <- c(mean(mspemati), mean(Ai), eta.val, K.val)
    return(mspe.ij)
  }
  result.mat=matrix(0,length(eta),4)
  for (i in 1:length(eta)){
    result.mat[i,] <- .fit.sgpls.binary(eta.val = eta[i],K.val = par.K.opti[i]) 
  }                                                                
  mspemat <- matrix(result.mat[, 1], length(eta),1)
  rownames(mspemat) <- paste("eta=",eta,"; K=",par.K.opti)
  colnames(mspemat) <- ""
  cands <- result.mat[result.mat[, 1] == min(result.mat[, 1]), 
                      , drop = FALSE]
  cands <- cands[cands[, 2] == min(cands[, 2]), , drop = FALSE]
  cands <- cands[cands[, 4] == min(cands[, 4]), , drop = FALSE]
  cands <- cands[cands[, 3] == max(cands[, 3]), , drop = FALSE]
  K.opt <- cands[, 4]
  eta.opt <- cands[, 3]
  if(verbose){cat(paste("\nOptimal parameters: eta = ", eta.opt, ", ", 
                        sep = ""))}
  if(verbose){cat(paste("K = ", K.opt, "\n", sep = ""))}
  if (plot.it) {
    oldpar <- par(no.readonly = TRUE)
    on.exit(par(oldpar))
    par(mar=c(5,7,4,2))
    spls::heatmap.spls(t(mspemat), main = "CV error Plot", 
                       coln = 16, as = "n")
  }
  rownames(mspemat) <- paste("eta=",eta,"; K=",par.K.opti)
  colnames(mspemat) <- ""
  cv <- list(err.mat = mspemat, eta.opt = eta.opt, K.opt = K.opt, cands=cands)
  return(cv)
}
