R/perf.R

Defines functions perf.gPLSda perf.sgPLSda perf.sPLSda perf.sgPLS perf.gPLS perf.sPLS

Documented in perf.gPLS perf.gPLSda perf.sgPLS perf.sgPLSda perf.sPLS perf.sPLSda

# ---------------------------------------------------
# perf for sPLS object
# ---------------------------------------------------

perf.sPLS <-
  function(object,
           criterion = c("all","MSEP", "R2", "Q2"), 
           validation = c("Mfold", "loo"),
           folds = 10,
           progressBar = TRUE,setseed=1,
           ...)
  {
    set.seed(setseed)
    #-- validation des arguments --#
    # these are the centered and scaled matrices output from spls 
    X = object$X
    Y = object$Y
    
    tol = object$tol
    max.iter = object$max.iter
    
    # tells which variables are selected in X and in Y:
    keepX = object$keepX   
    keepY = object$keepY   
    
    mode = object$mode
    ncomp = object$ncomp
    n = nrow(X)
    p = ncol(X)
    q = ncol(Y)
    res = list()
    
    validation = match.arg(validation)
    
    # initialize new objects:= to record feature stability
    featuresX  = featuresY =  list()
    for(k in 1:ncomp){
      featuresX[[k]] = featuresY[[k]] = NA
    }
    
    
    if (length(dim(X)) != 2) 
      stop("'X' must be a numeric matrix for validation.")
    
    if(object$mode == 'canonical') stop('sPLS mode should be set to regression, invariant or classic')
    
    if (any(criterion == "Q2") & ncomp == 1)
      stop("'ncomp' must be > 1 for Q2 criterion.")
    
    if (any(is.na(X)) || any(is.na(Y))) 
      stop("Missing data in 'X' and/or 'Y'. Use 'nipals' for dealing with NAs.")
    
    
    #-- M fold or loo cross validation --#
    #- define the folds
    if (validation == "Mfold") {
      if (is.list(folds)) {
        if (length(folds) < 2 | length(folds) > n)
          stop("Invalid number of folds.")
        if (length(unique(unlist(folds))) != n)
          stop("Invalid folds.")
        
        M = length(folds)
      }
      else {
        if (is.null(folds) || !is.numeric(folds) || folds < 2 || folds > n)
          stop("Invalid number of folds.")
        else {
          M = round(folds)
          folds = split(sample(1:n), rep(1:M, length = n)) 
        }
      }
    } 
    else { 
      folds = split(1:n, rep(1:n, length = n)) 
      M = n
    }
    
    
    #-- compute criteria ------------ --#
    RSS = rbind(rep(n - 1, q), matrix(nrow = ncomp, ncol = q))
    RSS.indiv = array(NA, c(n, q, ncomp+1))
    PRESS.inside = Q2.inside = matrix(nrow = ncomp, ncol = q)
    
    #KA: all criteria were included.
    if (any(criterion %in% c("all", "MSEP", "R2", "Q2"))) {
      press.mat = Ypred = array(NA, c(n, q, ncomp))
      MSEP = R2 = matrix(NA, nrow = q, ncol = ncomp)
      
      # set up dimnames
      rownames(MSEP) = rownames(R2) = colnames(Q2.inside) = colnames(Y)
      dimnames(press.mat)[[2]] = colnames(Y)
      
      # in case the test set only includes one sample, it is better to advise the user to
      # perform loocv
      stop.user = FALSE
      if (progressBar == TRUE) pb <- txtProgressBar(style = 3)
      
      for (i in 1:M) {
        if (progressBar == TRUE) setTxtProgressBar(pb, i/M)
        
        omit = folds[[i]]
        # see below, we stop the user if there is only one sample drawn on the test set using MFold
        if(length(omit) == 1) stop.user = TRUE
        
        # the training set is NOT scaled
        X.train = X[-omit, ]
        Y.train = Y[-omit, ]
        X.test = matrix(X[omit, ], nrow = length(omit))
        Y.test = matrix(Y[omit, ], nrow = length(omit))
        
        
        #-- spls --#
        spls.res = sPLS(X.train, Y.train, ncomp, mode, max.iter, tol, keepX=keepX, keepY=keepY)     ## change
        
        # added: record selected features in each set
        for(k in 1:ncomp){
          featuresX[[k]] = c(unlist(featuresX[[k]]), selectVar(spls.res, comp = k)$X$name)
          featuresY[[k]] = c(unlist(featuresY[[k]]), selectVar(spls.res, comp = k)$Y$name)
        }
        
        
        #if (!is.null(spls.res$nzv$Position)) X.test = X.test[, -spls.res$nzv$Position]
        Y.hat = predict(spls.res, X.test)$predict
        
        #compute prediction of Y
        for (h in 1:ncomp) {
          Ypred[omit, , h] = Y.hat[, , h]
          
          # KA: this bunch was added:
          # compute the press and the RSS
          # by definition (tenenhaus), RSS[h+1,] = (y_i - y.hat_(h-1)_i)^2
          press.mat[omit, , h] = (Y.test - Y.hat[, , h])^2
          RSS.indiv[omit, ,h+1] = (Y.test - Y.hat[, , h])^2
        } # end h
      } #end i (cross validation)
      
      # KA added
      # warn the user that at least test set had a length of 1
      if(stop.user == TRUE & validation == 'Mfold') stop('The folds value was set too high to perform cross validation. Choose validation = "loo" or set folds to a lower value')
      
      # these criteria are computed across all folds.
      for (h in 1:ncomp) { 
        MSEP[, h] = apply(as.matrix(press.mat[, , h]), 2, mean, na.rm = TRUE)
        R2[, h] = (diag(cor(Y, Ypred[,,h], use = "pairwise")))^2
        #KA:  PRESS is also computed as well as Q2 inside this procedure 
        if(q>1){
          RSS[h+1,] = t(apply(RSS.indiv[,,h+1], 2, sum))
          PRESS.inside[h, ] = colSums(press.mat[, , h], na.rm = TRUE)
        }else{
          RSS[h+1,q] = sum(RSS.indiv[,q,h+1])
          PRESS.inside[h, q] = sum(press.mat[,q, h], na.rm = TRUE)
        }
        
        Q2.inside[h, ] = 1 - PRESS.inside[h, ]/RSS[h, ]
        
      }
      
      colnames(MSEP) = colnames(R2) = rownames(Q2.inside) = paste('ncomp', c(1:ncomp), sep = " ")
      rownames(MSEP) = rownames(R2) = colnames(Q2.inside)  = colnames(Y)
      
      if (q == 1) rownames(MSEP) = rownames(R2) = ""
      
      if(ncomp>1){
        # compute Q2 total
        if(q>1){
          Q2.total = 1 - rowSums(PRESS.inside, na.rm = TRUE)/rowSums(RSS[-(ncomp+1), ], na.rm = TRUE)
        }else{ # for q == 1
          Q2.total = t(1 - PRESS.inside/RSS[-(ncomp+1), ])
        }
      }else{
        Q2.total = NA
      }
      
      names(Q2.total) = paste('comp', 1:ncomp, sep = " ")
      
    } # end all, MSEP, R2
    
    if (progressBar == TRUE) cat('\n')
    
    
    # ---- extract stability of features ----- # NEW
    list.featuresX = list.featuresY =list()
    for(k in 1:ncomp){
      #remove the NA value that was added for initialisation
      remove.naX = which(is.na(featuresX[[k]]))
      remove.naY = which(is.na(featuresY[[k]]))
      # then summarise as a factor and output the percentage of appearance
      list.featuresX[[k]] = sort(summary(as.factor(featuresX[[k]][-remove.naX]))/M, decreasing = TRUE)
      list.featuresY[[k]] = sort(summary(as.factor(featuresY[[k]][-remove.naY]))/M, decreasing = TRUE)
    }
    
    # extract features selected from the full model ---------
    features.finalX = features.finalY =list()
    for(k in 1:ncomp){
      features.finalX[[k]] = selectVar(object, comp = k)$X$value
      features.finalY[[k]] = selectVar(object, comp = k)$Y$value
    }
    
    names(features.finalX)  = names(features.finalY) = names(list.featuresX) = names(list.featuresY) = paste('comp', 1:ncomp)
    
    
    # ----------  final outputs:
    if (any(criterion %in% c("all", "MSEP"))) res$MSEP = MSEP
    if (any(criterion %in% c("all", "R2"))) res$R2 = R2
    if (any(criterion %in% c("all", "Q2"))) res$Q2 = t(Q2.inside)
    if (any(criterion %in% c("all", "Q2"))) res$Q2.total = Q2.total
    
    
    #features
    res$features$stable.X = list.featuresX
    res$features$stable.Y = list.featuresY
    res$features$final.X = features.finalX
    res$features$final.Y = features.finalY
    
    res$press.mat=press.mat
    res$RSS.indiv=RSS.indiv
    res$PRESS.inside=PRESS.inside
    res$RSS=RSS
    
    method = "pls.mthd"
    class(res) = c("perf", method)
    return(invisible(res))
  }

# ---------------------------------------------------
# perf for gPLS object
# ---------------------------------------------------

perf.gPLS <-
  function(object,
           criterion = c("all","MSEP", "R2", "Q2"), 
           validation = c("Mfold", "loo"),
           folds = 10,
           progressBar = TRUE,setseed=1,
           ...)
  {
    set.seed(setseed)
    #-- validation des arguments --#
    # these are the centered and scaled matrices output from spls 
    X = object$X
    Y = object$Y
    
    tol = object$tol
    max.iter = object$max.iter
    
    # tells which variables are selected in X and in Y:
    keepX = object$keepX   
    keepY = object$keepY   
    
    mode = object$mode
    ncomp = object$ncomp
    ind.block.x = object$ind.block.x
    ind.block.y = object$ind.block.y
    
    
    
    n = nrow(X)
    p = ncol(X)
    q = ncol(Y)
    res = list()
    
    validation = match.arg(validation)
    
    # initialize new objects:= to record feature stability
    featuresX  = featuresY =  list()
    for(k in 1:ncomp){
      featuresX[[k]] = featuresY[[k]] = NA
    }
    
    
    if (length(dim(X)) != 2) 
      stop("'X' must be a numeric matrix for validation.")
    
    if(object$mode == 'canonical') stop('sPLS mode should be set to regression, invariant or classic')
    
    if (any(criterion == "Q2") & ncomp == 1)
      stop("'ncomp' must be > 1 for Q2 criterion.")
    
    if (any(is.na(X)) || any(is.na(Y))) 
      stop("Missing data in 'X' and/or 'Y'. Use 'nipals' for dealing with NAs.")
    
    
    #-- M fold or loo cross validation --#
    #- define the folds
    if (validation == "Mfold") {
      if (is.list(folds)) {
        if (length(folds) < 2 | length(folds) > n)
          stop("Invalid number of folds.")
        if (length(unique(unlist(folds))) != n)
          stop("Invalid folds.")
        
        M = length(folds)
      }
      else {
        if (is.null(folds) || !is.numeric(folds) || folds < 2 || folds > n)
          stop("Invalid number of folds.")
        else {
          M = round(folds)
          folds = split(sample(1:n), rep(1:M, length = n)) 
        }
      }
    } 
    else { 
      folds = split(1:n, rep(1:n, length = n)) 
      M = n
    }
    
    
    #-- compute criteria ------------ --#
    RSS = rbind(rep(n - 1, q), matrix(nrow = ncomp, ncol = q))
    RSS.indiv = array(NA, c(n, q, ncomp+1))
    PRESS.inside = Q2.inside = matrix(nrow = ncomp, ncol = q)
    
    #KA: all criteria were included.
    if (any(criterion %in% c("all", "MSEP", "R2", "Q2"))) {
      press.mat = Ypred = array(NA, c(n, q, ncomp))
      MSEP = R2 = matrix(NA, nrow = q, ncol = ncomp)
      
      # set up dimnames
      rownames(MSEP) = rownames(R2) = colnames(Q2.inside) = colnames(Y)
      dimnames(press.mat)[[2]] = colnames(Y)
      
      # in case the test set only includes one sample, it is better to advise the user to
      # perform loocv
      stop.user = FALSE
      if (progressBar == TRUE) pb <- txtProgressBar(style = 3)
      
      for (i in 1:M) {
        if (progressBar == TRUE) setTxtProgressBar(pb, i/M)
        
        omit = folds[[i]]
        # see below, we stop the user if there is only one sample drawn on the test set using MFold
        if(length(omit) == 1) stop.user = TRUE
        
        # the training set is NOT scaled
        X.train = X[-omit, ]
        Y.train = Y[-omit, ]
        X.test = matrix(X[omit, ], nrow = length(omit))
        Y.test = matrix(Y[omit, ], nrow = length(omit))
        
        #-- gpls --#
        spls.res = gPLS(X.train, Y.train, ncomp, mode, max.iter, tol, keepX=keepX, keepY=keepY,ind.block.x=ind.block.x,ind.block.y=ind.block.y)     ## change
        
        # added: record selected features in each set
        for(k in 1:ncomp){
          featuresX[[k]] = c(unlist(featuresX[[k]]), selectVar(spls.res, comp = k)$X$name)
          featuresY[[k]] = c(unlist(featuresY[[k]]), selectVar(spls.res, comp = k)$Y$name)
        }
        
        
        #if (!is.null(spls.res$nzv$Position)) X.test = X.test[, -spls.res$nzv$Position]
        Y.hat = predict.gPLS(spls.res, X.test)$predict
        
        #compute prediction of Y
        for (h in 1:ncomp) {
          Ypred[omit, , h] = Y.hat[, , h]
          
          # KA: this bunch was added:
          # compute the press and the RSS
          # by definition (tenenhaus), RSS[h+1,] = (y_i - y.hat_(h-1)_i)^2
          press.mat[omit, , h] = (Y.test - Y.hat[, , h])^2
          RSS.indiv[omit, ,h+1] = (Y.test - Y.hat[, , h])^2
        } # end h
      } #end i (cross validation)
      
      # KA added
      # warn the user that at least test set had a length of 1
      if(stop.user == TRUE & validation == 'Mfold') stop('The folds value was set too high to perform cross validation. Choose validation = "loo" or set folds to a lower value')
      
      # these criteria are computed across all folds.
      for (h in 1:ncomp) { 
        MSEP[, h] = apply(as.matrix(press.mat[, , h]), 2, mean, na.rm = TRUE)
        R2[, h] = (diag(cor(Y, Ypred[,,h], use = "pairwise")))^2
        #KA:  PRESS is also computed as well as Q2 inside this procedure 
        if(q>1){
          RSS[h+1,] = t(apply(RSS.indiv[,,h+1], 2, sum))
          PRESS.inside[h, ] = colSums(press.mat[, , h], na.rm = TRUE)
        }else{
          RSS[h+1,q] = sum(RSS.indiv[,q,h+1])
          PRESS.inside[h, q] = sum(press.mat[,q, h], na.rm = TRUE)
        }
        
        Q2.inside[h, ] = 1 - PRESS.inside[h, ]/RSS[h, ]
        
      }
      
      colnames(MSEP) = colnames(R2) = rownames(Q2.inside) = paste('ncomp', c(1:ncomp), sep = " ")
      rownames(MSEP) = rownames(R2) = colnames(Q2.inside)  = colnames(Y)
      
      if (q == 1) rownames(MSEP) = rownames(R2) = ""
      
      if(ncomp>1){
        # compute Q2 total
        if(q>1){
          Q2.total = 1 - rowSums(PRESS.inside, na.rm = TRUE)/rowSums(RSS[-(ncomp+1), ], na.rm = TRUE)
        }else{ # for q == 1
          Q2.total = t(1 - PRESS.inside/RSS[-(ncomp+1), ])
        }
      }else{
        Q2.total = NA
      }
      
      names(Q2.total) = paste('comp', 1:ncomp, sep = " ")
      
    } # end all, MSEP, R2
    
    if (progressBar == TRUE) cat('\n')
    
    
    # ---- extract stability of features ----- # NEW
    list.featuresX = list.featuresY =list()
    for(k in 1:ncomp){
      #remove the NA value that was added for initialisation
      remove.naX = which(is.na(featuresX[[k]]))
      remove.naY = which(is.na(featuresY[[k]]))
      # then summarise as a factor and output the percentage of appearance
      list.featuresX[[k]] = sort(summary(as.factor(featuresX[[k]][-remove.naX]))/M, decreasing = TRUE)
      list.featuresY[[k]] = sort(summary(as.factor(featuresY[[k]][-remove.naY]))/M, decreasing = TRUE)
    }
    
    # extract features selected from the full model ---------
    features.finalX = features.finalY =list()
    for(k in 1:ncomp){
      features.finalX[[k]] = selectVar(object, comp = k)$X$value
      features.finalY[[k]] = selectVar(object, comp = k)$Y$value
    }
    
    names(features.finalX)  = names(features.finalY) = names(list.featuresX) = names(list.featuresX) = paste('comp', 1:ncomp)
    
    
    # ----------  final outputs:
    if (any(criterion %in% c("all", "MSEP"))) res$MSEP = MSEP
    if (any(criterion %in% c("all", "R2"))) res$R2 = R2
    if (any(criterion %in% c("all", "Q2"))) res$Q2 = t(Q2.inside)
    if (any(criterion %in% c("all", "Q2"))) res$Q2.total = Q2.total
    
    
    #features
    res$features$stable.X = list.featuresX
    res$features$stable.Y = list.featuresY
    res$features$final.X = features.finalX
    res$features$final.Y = features.finalY
    
    res$press.mat=press.mat
    res$RSS.indiv=RSS.indiv
    res$PRESS.inside=PRESS.inside
    res$RSS=RSS
    
    method = "pls.mthd"
    class(res) = c("perf", method)
    return(invisible(res))
  }


# ---------------------------------------------------
# perf for sgPLS object
# ---------------------------------------------------




perf.sgPLS <-
  function(object,
           criterion = c("all","MSEP", "R2", "Q2"), 
           validation = c("Mfold", "loo"),
           folds = 10,
           progressBar = TRUE,setseed=1,
           ...)
  {
    set.seed(setseed)
    #-- validation des arguments --#
    # these are the centered and scaled matrices output from spls 
    X = object$X
    Y = object$Y
    
    tol = object$tol
    max.iter = object$max.iter
    
    # tells which variables are selected in X and in Y:
    keepX = object$keepX   
    keepY = object$keepY   
    
    mode = object$mode
    ncomp = object$ncomp
    ind.block.x = object$ind.block.x
    ind.block.y = object$ind.block.y
    alpha.x = object$alpha.x
    alpha.y = object$alpha.y
    upper.lambda = object$upper.lambda
    
    n = nrow(X)
    p = ncol(X)
    q = ncol(Y)
    res = list()
    
    validation = match.arg(validation)
    
    # initialize new objects:= to record feature stability
    featuresX  = featuresY =  list()
    for(k in 1:ncomp){
      featuresX[[k]] = featuresY[[k]] = NA
    }
    
    
    if (length(dim(X)) != 2) 
      stop("'X' must be a numeric matrix for validation.")
    
    if(object$mode == 'canonical') stop('sPLS mode should be set to regression, invariant or classic')
    
    if (any(criterion == "Q2") & ncomp == 1)
      stop("'ncomp' must be > 1 for Q2 criterion.")
    
    if (any(is.na(X)) || any(is.na(Y))) 
      stop("Missing data in 'X' and/or 'Y'. Use 'nipals' for dealing with NAs.")
    
    
    #-- M fold or loo cross validation --#
    #- define the folds
    if (validation == "Mfold") {
      if (is.list(folds)) {
        if (length(folds) < 2 | length(folds) > n)
          stop("Invalid number of folds.")
        if (length(unique(unlist(folds))) != n)
          stop("Invalid folds.")
        
        M = length(folds)
      }
      else {
        if (is.null(folds) || !is.numeric(folds) || folds < 2 || folds > n)
          stop("Invalid number of folds.")
        else {
          M = round(folds)
          folds = split(sample(1:n), rep(1:M, length = n)) 
        }
      }
    } 
    else { 
      folds = split(1:n, rep(1:n, length = n)) 
      M = n
    }
    
    
    #-- compute criteria ------------ --#
    RSS = rbind(rep(n - 1, q), matrix(nrow = ncomp, ncol = q))
    RSS.indiv = array(NA, c(n, q, ncomp+1))
    PRESS.inside = Q2.inside = matrix(nrow = ncomp, ncol = q)
    
    #KA: all criteria were included.
    if (any(criterion %in% c("all", "MSEP", "R2", "Q2"))) {
      press.mat = Ypred = array(NA, c(n, q, ncomp))
      MSEP = R2 = matrix(NA, nrow = q, ncol = ncomp)
      
      # set up dimnames
      rownames(MSEP) = rownames(R2) = colnames(Q2.inside) = colnames(Y)
      dimnames(press.mat)[[2]] = colnames(Y)
      
      # in case the test set only includes one sample, it is better to advise the user to
      # perform loocv
      stop.user = FALSE
      if (progressBar == TRUE) pb <- txtProgressBar(style = 3)
      
      for (i in 1:M) {
        if (progressBar == TRUE) setTxtProgressBar(pb, i/M)
        
        omit = folds[[i]]
        # see below, we stop the user if there is only one sample drawn on the test set using MFold
        if(length(omit) == 1) stop.user = TRUE
        
        # the training set is NOT scaled
        X.train = X[-omit, ]
        Y.train = Y[-omit, ]
        X.test = matrix(X[omit, ], nrow = length(omit))
        Y.test = matrix(Y[omit, ], nrow = length(omit))
        
        #-- sgpls --#
        spls.res = sgPLS(X.train, Y.train, ncomp, mode, max.iter, tol, keepX=keepX, keepY=keepY,ind.block.x=ind.block.x,ind.block.y=ind.block.y,alpha.x=alpha.x,alpha.y=alpha.y,upper.lambda=upper.lambda)     ## change
        #  Sparse.Group.spls.BP(X,Y,ncomp=1,mode="regression",max.iter=500,tol=1e-06,keepX=c(4),keepY=c(4),ind.block.y=ind.block.y,ind.block.x=ind.block.x,alpha.x=0.05,alpha.y=0.95,upper.lambda=1000000000)
        #res.sparse <- s
        # added: record selected features in each set
        for(k in 1:ncomp){
          featuresX[[k]] = c(unlist(featuresX[[k]]), selectVar(spls.res, comp = k)$X$name)
          featuresY[[k]] = c(unlist(featuresY[[k]]), selectVar(spls.res, comp = k)$Y$name)
        }
        
        
        #if (!is.null(spls.res$nzv$Position)) X.test = X.test[, -spls.res$nzv$Position]
        Y.hat = predict.sgPLS(spls.res, X.test)$predict
        
        #compute prediction of Y
        for (h in 1:ncomp) {
          Ypred[omit, , h] = Y.hat[, , h]
          
          # KA: this bunch was added:
          # compute the press and the RSS
          # by definition (tenenhaus), RSS[h+1,] = (y_i - y.hat_(h-1)_i)^2
          press.mat[omit, , h] = (Y.test - Y.hat[, , h])^2
          RSS.indiv[omit, ,h+1] = (Y.test - Y.hat[, , h])^2
        } # end h
      } #end i (cross validation)
      
      # KA added
      # warn the user that at least test set had a length of 1
      if(stop.user == TRUE & validation == 'Mfold') stop('The folds value was set too high to perform cross validation. Choose validation = "loo" or set folds to a lower value')
      
      # these criteria are computed across all folds.
      for (h in 1:ncomp) { 
        MSEP[, h] = apply(as.matrix(press.mat[, , h]), 2, mean, na.rm = TRUE)
        R2[, h] = (diag(cor(Y, Ypred[,,h], use = "pairwise")))^2
        #KA:  PRESS is also computed as well as Q2 inside this procedure 
        if(q>1){
          RSS[h+1,] = t(apply(RSS.indiv[,,h+1], 2, sum))
          PRESS.inside[h, ] = colSums(press.mat[, , h], na.rm = TRUE)
        }else{
          RSS[h+1,q] = sum(RSS.indiv[,q,h+1])
          PRESS.inside[h, q] = sum(press.mat[,q, h], na.rm = TRUE)
        }
        
        Q2.inside[h, ] = 1 - PRESS.inside[h, ]/RSS[h, ]
        
      }
      
      colnames(MSEP) = colnames(R2) = rownames(Q2.inside) = paste('ncomp', c(1:ncomp), sep = " ")
      rownames(MSEP) = rownames(R2) = colnames(Q2.inside)  = colnames(Y)
      
      if (q == 1) rownames(MSEP) = rownames(R2) = ""
      
      if(ncomp>1){
        # compute Q2 total
        if(q>1){
          Q2.total = 1 - rowSums(PRESS.inside, na.rm = TRUE)/rowSums(RSS[-(ncomp+1), ], na.rm = TRUE)
        }else{ # for q == 1
          Q2.total = t(1 - PRESS.inside/RSS[-(ncomp+1), ])
        }
      }else{
        Q2.total = NA
      }
      
      names(Q2.total) = paste('comp', 1:ncomp, sep = " ")
      
    } # end all, MSEP, R2
    
    if (progressBar == TRUE) cat('\n')
    
    
    # ---- extract stability of features ----- # NEW
    list.featuresX = list.featuresY =list()
    for(k in 1:ncomp){
      #remove the NA value that was added for initialisation
      remove.naX = which(is.na(featuresX[[k]]))
      remove.naY = which(is.na(featuresY[[k]]))
      # then summarise as a factor and output the percentage of appearance
      list.featuresX[[k]] = sort(summary(as.factor(featuresX[[k]][-remove.naX]))/M, decreasing = TRUE)
      list.featuresY[[k]] = sort(summary(as.factor(featuresY[[k]][-remove.naY]))/M, decreasing = TRUE)
    }
    
    # extract features selected from the full model ---------
    features.finalX = features.finalY =list()
    for(k in 1:ncomp){
      features.finalX[[k]] = selectVar(object, comp = k)$X$value
      features.finalY[[k]] = selectVar(object, comp = k)$Y$value
    }
    
    names(features.finalX)  = names(features.finalY) = names(list.featuresX) = names(list.featuresX) = paste('comp', 1:ncomp)
    
    
    # ----------  final outputs:
    if (any(criterion %in% c("all", "MSEP"))) res$MSEP = MSEP
    if (any(criterion %in% c("all", "R2"))) res$R2 = R2
    if (any(criterion %in% c("all", "Q2"))) res$Q2 = t(Q2.inside)
    if (any(criterion %in% c("all", "Q2"))) res$Q2.total = Q2.total
    
    
    #features
    res$features$stable.X = list.featuresX
    res$features$stable.Y = list.featuresY
    res$features$final.X = features.finalX
    res$features$final.Y = features.finalY
    
    res$press.mat=press.mat
    res$RSS.indiv=RSS.indiv
    res$PRESS.inside=PRESS.inside
    res$RSS=RSS
    
    method = "pls.mthd"
    class(res) = c("perf", method)
    return(invisible(res))
  }

# ---------------------------------------------------
# perf for splsda object
# ---------------------------------------------------
perf.sPLSda <- function(object,                                               
                        method.predict = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"),     
                        validation = c("Mfold", "loo"),                                          
                        folds = 10,                                                              
                        progressBar = TRUE,...)                                                        
{
  
  #-- initialising arguments --#
  # these data are the centered and scaled X output or the unmapped(Y) scaled and centered
  X = object$X
  level.Y = object$names$Y  #to make sure the levels are ordered
  Y = object$ind.mat
  Y = map(Y)
  Y = factor(Y,labels = level.Y)
  ncomp = object$ncomp
  n = nrow(X)
  keepX = object$keepX  
  
  tol = object$tol
  max.iter = object$max.iter
  
  # initialize new objects:
  features <- list()
  for(k in 1:ncomp){
    features[[k]] = NA
  }
  
  method.predict = match.arg(method.predict, choices = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), several.ok = TRUE)
  if (any(method.predict == "all")) nmthdd = 3 
  else nmthdd = length(method.predict)  
  
  
 
  
  
  error.fun = function(x, y) {
    error.vec = sweep(x, 1, y, FUN = "-")
    error.vec = (error.vec != 0)
    error.vec = apply(error.vec, 2, sum) / length(y)
    return(error.vec)
  }
  
  #-- define the folds --#
  if (validation == "Mfold") {
    if (is.list(folds)) {
      if (length(folds) < 2 | length(folds) > n)
        stop("Invalid number of folds.")
      if (length(unique(unlist(folds))) != n)
        stop("Invalid folds.")
      
      M = length(folds)
    }
    else {
      if (is.null(folds) || !is.numeric(folds) || folds < 2 || folds > n)
        stop("Invalid number of folds.")
      else {
        M = round(folds)
        folds = split(sample(1:n), rep(1:M, length = n)) 
      }
    }
  } 
  else { 
    folds = split(1:n, rep(1:n, length = n)) 
    M = n
  }
  
  
  error.mat = array(0, dim = c(ncomp, nmthdd, M))
  
  # in case the test set only includes one sample, it is better to advise the user to perform loocv
  stop.user = FALSE
  # set up a progress bar
  if (progressBar == TRUE) pb <- txtProgressBar(style = 3)
  
  for (i in 1:M) {
    if (progressBar == TRUE) setTxtProgressBar(pb, i/M)
    
    #set up leave out samples.
    omit = folds[[i]]
    
    # see below, we stop the user if there is only one sample drawn on the test set using MFold
    if(length(omit) == 1) stop.user = TRUE
    
    # the training set is NOT scaled
    X.train = X[-omit, ]
    Y.train = Y[-omit]
    X.test = matrix(X[omit, ], nrow = length(omit))
    
    spls.res = sPLSda(X.train, Y.train, ncomp, max.iter = max.iter , tol = tol, keepX=keepX)  
    # added: record selected features
    for(k in 1:ncomp){
      features[[k]] = c(unlist(features[[k]]), selectVar(spls.res, comp = k)$name)
    }
    
    if (!is.null(spls.res$nzv$Position)) X.test = X.test[, -spls.res$nzv$Position]
    Y.predict = predict(spls.res, X.test, method = method.predict)$class
    error.mat[, , i] = sapply(Y.predict, error.fun, y = as.numeric(Y[omit]))
    
    
  } # end loop on i
  
  # warn the user that at least test set had a length of 1
  if(stop.user == TRUE & validation == 'Mfold') stop('The folds value was set too high to perform cross validation. Choose validation = "loo" or set folds to a lower value')
  
  if (progressBar == TRUE) cat('\n')
  
  #-- compute the error --#
  res = apply(error.mat, 1:2, mean)
  
  rownames(res) = paste('ncomp', 1:ncomp, sep = " ")
  colnames(res) = names(Y.predict)
  
  # ---- extract stability of features ----- # NEW
  list.features = list()
  for(k in 1:ncomp){
    #remove the NA value that was added for initialisation
    remove.na = which(is.na(features[[k]]))
    # then summarise as a factor and output the percentage of appearance
    list.features[[k]] = sort(table(as.factor(features[[k]][-remove.na]))/M, decreasing = TRUE)
  }
  

  
  names(list.features) = paste('comp', 1:ncomp)
  
  result = list()
  result$error.rate = res
  result$features$stable = list.features
  
  # added
 # result$nzvX = nzv$Position
  
  
  method = "plsda.mthd"
  result$meth = "splsda.mthd"
  class(result) = c("perf", method)
  #updated outputs
  return(invisible(result))
}


# ---------------------------------------------------
# perf for sgplsda object
# ---------------------------------------------------
perf.sgPLSda <- function(object,                                               
                        method.predict = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"),     
                        validation = c("Mfold", "loo"),                                          
                        folds = 10,                                                              
                        progressBar = TRUE,...)                                                        
{
  
  #-- initialising arguments --#
  # these data are the centered and scaled X output or the unmapped(Y) scaled and centered
  X = object$X
  level.Y = object$names$Y  #to make sure the levels are ordered
  Y = object$ind.mat
  Y = map(Y)
  Y = factor(Y,labels = level.Y)
  ncomp = object$ncomp
  n = nrow(X)
  keepX = object$keepX  
  
  # tells which variables are selected in X and in Y:
    
    
    ind.block.x = object$ind.block.x
    ind.block.y = object$ind.block.y
    alpha.x = object$alpha.x
    alpha.y = object$alpha.y
    upper.lambda = object$upper.lambda
  
  
  
  tol = object$tol
  max.iter = object$max.iter
  
  # initialize new objects:
  features <- list()
  for(k in 1:ncomp){
    features[[k]] = NA
  }
  
  method.predict = match.arg(method.predict, choices = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), several.ok = TRUE)
  if (any(method.predict == "all")) nmthdd = 3 
  else nmthdd = length(method.predict)  
  
  
   
  
  error.fun = function(x, y) {
    error.vec = sweep(x, 1, y, FUN = "-")
    error.vec = (error.vec != 0)
    error.vec = apply(error.vec, 2, sum) / length(y)
    return(error.vec)
  }
  
  #-- define the folds --#
  if (validation == "Mfold") {
    if (is.list(folds)) {
      if (length(folds) < 2 | length(folds) > n)
        stop("Invalid number of folds.")
      if (length(unique(unlist(folds))) != n)
        stop("Invalid folds.")
      
      M = length(folds)
    }
    else {
      if (is.null(folds) || !is.numeric(folds) || folds < 2 || folds > n)
        stop("Invalid number of folds.")
      else {
        M = round(folds)
        folds = split(sample(1:n), rep(1:M, length = n)) 
      }
    }
  } 
  else { 
    folds = split(1:n, rep(1:n, length = n)) 
    M = n
  }
  
  
  error.mat = array(0, dim = c(ncomp, nmthdd, M))
  
  # in case the test set only includes one sample, it is better to advise the user to perform loocv
  stop.user = FALSE
  # set up a progress bar
  if (progressBar == TRUE) pb <- txtProgressBar(style = 3)
  
  for (i in 1:M) {
    if (progressBar == TRUE) setTxtProgressBar(pb, i/M)
    
    #set up leave out samples.
    omit = folds[[i]]
    
    # see below, we stop the user if there is only one sample drawn on the test set using MFold
    if(length(omit) == 1) stop.user = TRUE
    
    # the training set is NOT scaled
    X.train = X[-omit, ]
    Y.train = Y[-omit]
    X.test = matrix(X[omit, ], nrow = length(omit))
     
    spls.res = sgPLSda(X = X.train,Y = Y.train, ncomp = ncomp , max.iter = max.iter, tol = tol, keepX=keepX, ind.block.x=ind.block.x, alpha.x = alpha.x, upper.lambda = upper.lambda)     ## change
     
    
    
    
    # added: record selected features
    for(k in 1:ncomp){
      features[[k]] = c(unlist(features[[k]]), selectVar(spls.res, comp = k)$name)
    }
    
    if (!is.null(spls.res$nzv$Position)) X.test = X.test[, -spls.res$nzv$Position]
    Y.predict = predict(spls.res, X.test, method = method.predict)$class
    error.mat[, , i] = sapply(Y.predict, error.fun, y = as.numeric(Y[omit]))
    
    
  } # end loop on i
  
  # warn the user that at least test set had a length of 1
  if(stop.user == TRUE & validation == 'Mfold') stop('The folds value was set too high to perform cross validation. Choose validation = "loo" or set folds to a lower value')
  
  if (progressBar == TRUE) cat('\n')
  
  #-- compute the error --#
  res = apply(error.mat, 1:2, mean)
  
  rownames(res) = paste('ncomp', 1:ncomp, sep = " ")
  colnames(res) = names(Y.predict)
  
  # ---- extract stability of features ----- # NEW
  list.features = list()
  for(k in 1:ncomp){
    #remove the NA value that was added for initialisation
    remove.na = which(is.na(features[[k]]))
    # then summarise as a factor and output the percentage of appearance
    list.features[[k]] = sort(table(as.factor(features[[k]][-remove.na]))/M, decreasing = TRUE)
  }
  

  
  names(list.features) = paste('comp', 1:ncomp)
  
  result = list()
  result$error.rate = res
  result$features$stable = list.features
  
  # added
 # X = nzv$Position
  
  
  method = "plsda.mthd"
  result$meth = "splsda.mthd"
  class(result) = c("perf", method)
  #updated outputs
  return(invisible(result))
}

# ---------------------------------------------------
# perf for sgplsda object
# ---------------------------------------------------
perf.gPLSda <- function(object,                                               
                        method.predict = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"),     
                        validation = c("Mfold", "loo"),                                          
                        folds = 10,                                                              
                        progressBar = TRUE,...)                                                        
{
  
  #-- initialising arguments --#
  # these data are the centered and scaled X output or the unmapped(Y) scaled and centered
  X = object$X
  level.Y = object$names$Y  #to make sure the levels are ordered
  Y = object$ind.mat
  Y = map(Y)
  Y = factor(Y,labels = level.Y)
  ncomp = object$ncomp
  n = nrow(X)
  keepX = object$keepX  
  
  # tells which variables are selected in X and in Y:
    
    
    ind.block.x = object$ind.block.x
    ind.block.y = object$ind.block.y
    
  
  
  
  tol = object$tol
  max.iter = object$max.iter
  
  # initialize new objects:
  features <- list()
  for(k in 1:ncomp){
    features[[k]] = NA
  }
  
  method.predict = match.arg(method.predict, choices = c("all", "max.dist", "centroids.dist", "mahalanobis.dist"), several.ok = TRUE)
  if (any(method.predict == "all")) nmthdd = 3 
  else nmthdd = length(method.predict)  
  
  
   
  
  error.fun = function(x, y) {
    error.vec = sweep(x, 1, y, FUN = "-")
    error.vec = (error.vec != 0)
    error.vec = apply(error.vec, 2, sum) / length(y)
    return(error.vec)
  }
  
  #-- define the folds --#
  if (validation == "Mfold") {
    if (is.list(folds)) {
      if (length(folds) < 2 | length(folds) > n)
        stop("Invalid number of folds.")
      if (length(unique(unlist(folds))) != n)
        stop("Invalid folds.")
      
      M = length(folds)
    }
    else {
      if (is.null(folds) || !is.numeric(folds) || folds < 2 || folds > n)
        stop("Invalid number of folds.")
      else {
        M = round(folds)
        folds = split(sample(1:n), rep(1:M, length = n)) 
      }
    }
  } 
  else { 
    folds = split(1:n, rep(1:n, length = n)) 
    M = n
  }
  
  
  error.mat = array(0, dim = c(ncomp, nmthdd, M))
  
  # in case the test set only includes one sample, it is better to advise the user to perform loocv
  stop.user = FALSE
  # set up a progress bar
  if (progressBar == TRUE) pb <- txtProgressBar(style = 3)
  
  for (i in 1:M) {
    if (progressBar == TRUE) setTxtProgressBar(pb, i/M)
    
    #set up leave out samples.
    omit = folds[[i]]
    
    # see below, we stop the user if there is only one sample drawn on the test set using MFold
    if(length(omit) == 1) stop.user = TRUE
    
    # the training set is NOT scaled
    X.train = X[-omit, ]
    Y.train = Y[-omit]
    X.test = matrix(X[omit, ], nrow = length(omit))
     
    spls.res = gPLSda(X = X.train,Y = Y.train, ncomp = ncomp , max.iter = max.iter, tol = tol, keepX=keepX, ind.block.x=ind.block.x)     ## change
     
    
    
    
    # added: record selected features
    for(k in 1:ncomp){
      features[[k]] = c(unlist(features[[k]]), selectVar(spls.res, comp = k)$name)
    }
    
    if (!is.null(spls.res$nzv$Position)) X.test = X.test[, -spls.res$nzv$Position]
    Y.predict = predict(spls.res, X.test, method = method.predict)$class
    error.mat[, , i] = sapply(Y.predict, error.fun, y = as.numeric(Y[omit]))
    
    
  } # end loop on i
  
  # warn the user that at least test set had a length of 1
  if(stop.user == TRUE & validation == 'Mfold') stop('The folds value was set too high to perform cross validation. Choose validation = "loo" or set folds to a lower value')
  
  if (progressBar == TRUE) cat('\n')
  
  #-- compute the error --#
  res = apply(error.mat, 1:2, mean)
  
  rownames(res) = paste('ncomp', 1:ncomp, sep = " ")
  colnames(res) = names(Y.predict)
  
  # ---- extract stability of features ----- # NEW
  list.features = list()
  for(k in 1:ncomp){
    #remove the NA value that was added for initialisation
    remove.na = which(is.na(features[[k]]))
    # then summarise as a factor and output the percentage of appearance
    list.features[[k]] = sort(table(as.factor(features[[k]][-remove.na]))/M, decreasing = TRUE)
  }
  

  
  names(list.features) = paste('comp', 1:ncomp)
  
  result = list()
  result$error.rate = res
  result$features$stable = list.features
  
  # added
  #result$nzvX = nzv$Position
  
  
  method = "plsda.mthd"
  result$meth = "splsda.mthd"
  class(result) = c("perf", method)
  #updated outputs
  return(invisible(result))
}

Try the sgPLS package in your browser

Any scripts or data that you put into this service are public.

sgPLS documentation built on Oct. 5, 2023, 5:06 p.m.