# R/cv_sparseSCA.R

#'A K-fold cross-validation procedure when common/distinctive processes are unknown with Lasso and Group Lasso penalties.
#'
#'\code{cv_sparseSCA} helps to find a range of Lasso and Group Lasso tuning parameters for the common component so as to generate sparse common component.
#'
#'This function searches through a range of Lasso and Group Lasso tuning parameters for identifying common and distinctive components
#'
#'@param DATA The concatenated data block, with rows representing subjects.
#'@param Jk A vector. Each element of this vector is the number of columns of a data block.
#'@param R The number of components (R>=2).
#'@param MaxIter Maximum number of iterations for this algorithm. The default value is 400.
#'@param NRSTARTS The number of multistarts for this algorithm. The default value is 1.
#'@param LassoSequence The range of Lasso tuning parameters. The default value is a sequence of 20 numbers from 0.00000001
#'to the smallest Lasso tuning parameter value that makes all the component loadings equal to zero. Note that by default the 20 numbers are equally spaced on the log scale. 
#'@param GLassoSequence The range of Group Lasso tuning parameters. The default value is a sequence of 20 numbers from 0.00000001
#'to the smallest Group Lasso tuning parameter value that makes all the component loadings equal to zero. Note that by default the 20 numbers are equally spaced (but not on the log scale). 
#'Note that if \code{LassoSequence} contains only one number, then by default \code{GLassoSequence} is a sequence of 50 values.
#'@param nfolds Number of folds. If missing, then 10 fold cross-validation will be performed.
#'@param method "datablock" or "component". These are two options with respect to the grouping of the loadings as used in the Group Lasso penalty. 
#'If \code{method="component"}, the block-grouping of the coefficients is applied per component separately. If \code{method = "datablock"}, the grouping
#'is applied on the concatenated data block, with loadings of all components together. If \code{method} is missing, then the "component" method is used 
#'by default.  
#'
#'@return
#'\item{MSPE}{A matrix of mean squared prediction error (MSPE) for the sequences of Lasso and Group Lasso tuning parameters.}
#'\item{SE_MSE}{A matrix of standard errors for \code{MSPE}.}
#'\item{MSPE1SE}{The lowest MSPE + 1SE.}
#'\item{VarSelected}{A matrix of number of variables selected for the sequences of Lasso and Group Lasso tuning parameters.}
#'\item{Lasso_values}{The sequence of Lasso tuning parameters used for cross-validation. Users may also consult \code{Lambdaregion} (explained below).}
#'\item{Glasso_values}{The sequence of Group Lasso tuning parameters used for cross-validation. For example, suppose from the plot we found that the index number
#'for Group Lasso is \code{6}, its corresponding Group Lasso tuning parameter is \code{Glasso_values[6]}.}
#'\item{Lambdaregion}{A region of proper tuning parameter values for Lasso, given a certain value for Group Lasso. This means that, for example, if 5 Group Lasso tuning parameter values have been considered, \code{Lambdaregion} is a 5 by 2 matrix.}
#'\item{RecommendedLambda}{A pair (or sometimes a few pairs) of Lasso and Group Lasso tuning parameters that lead to a model with MSPE closest to the lowest MSPE + 1SE.}
#'\item{P_hat}{Estimated component loading matrix, given the recommended tuning parameters.}
#'\item{T_hat}{Estimated component score matrix, given the recommended tuning parameters.}
#'\item{plotlog}{An index number for function \code{plot}, which is not useful for users.}
#'@examples
#'\dontrun{
#'DATA1 <- matrix(rnorm(50), nrow=5)
#'DATA2 <- matrix(rnorm(100), nrow=5)  
#'DATA <- cbind(DATA1, DATA2)
#'Jk <- c(10, 20) 
#'cv_sparseSCA(DATA, Jk, R=5, MaxIter = 100, NRSTARTS = 40, nfolds=10)
#'}
#'@references Witten, D.M., Tibshirani, R., & Hastie, T. (2009), A penalized matrix decomposition, with applications to sparse principal components and canonical correlation analysis. \emph{Biostatistics}, \emph{10}(3), 515-534.
#'@references
#'Friedman, J., Hastie, T., & Tibshirani, R. (2010). A note on the group lasso and a sparse group lasso. arXiv preprint arXiv:1001.0736.
#'@references
#'Yuan, M., & Lin, Y. (2006). Model selection and estimation in regression with grouped variables. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 68(1), 49-67.
#'@export
cv_sparseSCA <- function(DATA, Jk, R, MaxIter, NRSTARTS, LassoSequence, GLassoSequence, nfolds, method){

  # K-fold cross-validation over a grid of Lasso x Group Lasso tuning
  # parameters for sparse SCA. For every grid point the mean squared
  # prediction error (MSPE) and its standard error are estimated on
  # randomly scattered held-out cells; the "lowest MSPE + 1SE" rule then
  # recommends a tuning pair, and the model is re-estimated with it.
  #
  # Relies on maxLGlasso(), CDfriedmanV1(), CDfriedmanV2() and sparseSCA()
  # from this package. Returns an S3 object of class "CVsparseSCA".

  DATA <- data.matrix(DATA)
  plotlog <- 0  # flag for plot(): 1 means the Lasso sequence is on the log scale

  if(R == 1){
    stop("Parameter R = 1 is not allowed.")
  }

  if(missing(LassoSequence) || missing(GLassoSequence)){

      # maxLGlasso() gives the smallest tuning values that shrink all
      # loadings to exactly zero; they serve as upper bounds for the
      # default tuning sequences.
      results <- maxLGlasso(DATA, Jk, R)
      GLassomax <- results$Glasso
      Lassomax <- results$Lasso


    if(missing(LassoSequence) && missing(GLassoSequence)){
      LassoSequence <- exp(seq(from = log(0.00000001), to = log(Lassomax), length.out = 20))
      GLassoSequence <- seq(from = 0.00000001, to = GLassomax, length.out = 20)  #note that Glasso is not on the log scale, because it is not helpful.
      plotlog <- 1
    } else if(missing(LassoSequence) && (length(GLassoSequence) == 1)){
        LassoSequence <- exp(seq(from = log(0.00000001), to = log(Lassomax), length.out = 50))
        plotlog <- 1
    } else if(missing(GLassoSequence) && (length(LassoSequence) == 1)){
        GLassoSequence <- seq(from = 0.00000001, to = GLassomax, length.out = 50)
    } else if(missing(GLassoSequence) && (length(LassoSequence)>1)){
        GLassoSequence <- seq(from = 0.00000001, to = GLassomax, length.out = 20)
    } else if(missing(LassoSequence) && (length(GLassoSequence)>1)){
        LassoSequence <- exp(seq(from = log(0.00000001), to = log(Lassomax), length.out = 20))
        plotlog <- 1
    }


  }

  if(length(Jk) <= 1){
    stop("This is an algorithm for data integration! There should be at least 2 data blocks!")
  }

  if (min(GLassoSequence) < 0) {
    stop("Group Lasso tuning parameter must be non-negative!")
  }

  if (min(LassoSequence) < 0) {
    stop("Lasso tuning parameter must be non-negative!")
  }
  if(missing(MaxIter)){
    MaxIter <- 400
  }

  if(missing(NRSTARTS)){
    NRSTARTS <- 1
  }

  if(missing(nfolds)){
    nfolds <- 10
  }
  if (nfolds < 2){
    stop("Must be at least 2 folds!")
  }

  if(missing(method)){
    method <- "component"
  }

  # Grid containers: rows index the Lasso sequence, columns the Group Lasso
  # sequence.
  PRESS <- matrix(0, length(LassoSequence), length(GLassoSequence))        # MSPE per tuning pair
  se_MSE <- matrix(0, length(LassoSequence), length(GLassoSequence))       # standard error of MSPE
  varselected <- matrix(0, length(LassoSequence), length(GLassoSequence))  # number of non-zero loadings
  percentRemove <- 1/nfolds

  # Randomly scatter the held-out cells over the data matrix, and redraw
  # until no fold would remove an entire row or column.
  ii <- 0
  while(ii != 1){ #this procedure is to make sure that the training sample do not have an entire row/column of NA's

    randmat <- matrix(stats::runif(nrow(DATA) * ncol(DATA)), ncol = ncol(DATA))
    jj <- 0
    for (i in 1:nfolds){
      ToRemove <- ((i - 1) * percentRemove < randmat) & (randmat < i * percentRemove) # this idea is from PMA package
      DATArm <- DATA
      DATArm[ToRemove] <- NA

      if(ncol(DATArm) %in% rowSums(is.na(DATArm))){
        break  # some row would be entirely NA: redraw
      } else if(nrow(DATArm) %in% colSums(is.na(DATArm))){
        break  # some column would be entirely NA: redraw
      }else{
        jj <- jj+1
      }
    }

    if(jj == nfolds){
      ii <- 1
    }
  }


  for (g in 1:length(GLassoSequence)){

    for (l in 1:length(LassoSequence)){

      cat(sprintf("\nThe cross-validation procedure might take a while to finish. Please be patient."))
      cat(sprintf("\nGroup Lasso: %s", GLassoSequence[g]))
      cat(sprintf("\nLasso: %s", LassoSequence[l]))

      # Fit once on the full data, only to count the selected loadings.
      if(method == "datablock"){
        Forvarselected <- CDfriedmanV1(DATA, Jk, R, LassoSequence[l], GLassoSequence[g], MaxIter)
      }else if (method == "component"){
        Forvarselected <- CDfriedmanV2(DATA, Jk, R, LassoSequence[l], GLassoSequence[g], MaxIter)
      }
      varselected[l,g] <- sum(Forvarselected$Pmatrix != 0)  #how many variables in P have been selected?

      error_x <- array()
      for (i in 1:nfolds){
        ToRemove <- ((i - 1) * percentRemove < randmat) & (randmat < i * percentRemove) # this idea is from PMA package
        DATArm <- DATA
        DATArm[ToRemove] <- NA

        for(c in 1:ncol(DATA)){
          indexc <- !is.na(DATArm[, c])
          DATArm[, c][!indexc] <- mean(DATArm[, c][indexc]) #missing values are replaced by column means
        }

        # Multistart: keep the solution with the lowest loss.
        Pout3d <- list()
        Tout3d <- list()
        LOSS <- array()
        for (n in 1:NRSTARTS){
          if(method == "datablock"){
            VarSelectResult <- CDfriedmanV1(DATArm, Jk, R, LassoSequence[l], GLassoSequence[g], MaxIter)
          }else if (method == "component"){
            VarSelectResult <- CDfriedmanV2(DATArm, Jk, R, LassoSequence[l], GLassoSequence[g], MaxIter)
          }
          Pout3d[[n]] <- VarSelectResult$Pmatrix
          Tout3d[[n]] <- VarSelectResult$Tmatrix
          LOSS[n] <- VarSelectResult$Loss
        }
        k <- which(LOSS == min(LOSS))
        if (length(k)>1){
          pos <- sample(1:length(k), 1)  # break ties at random
          k <- k[pos]
        }
        PoutBest <- Pout3d[[k]]
        ToutBest <- Tout3d[[k]]

        # Prediction error on the held-out cells only.
        DATA_hat <- ToutBest%*%t(PoutBest)
        error_x[i] <- sum((DATA[ToRemove] - DATA_hat[ToRemove])^2)

      }


      PRESS[l,g] <- sum(error_x)/nfolds
      se_MSE[l,g] <- stats::sd(error_x)/sqrt(nfolds)
    }
  }

  vec_PRESS <- c(PRESS)  # column-major vectorization of the MSPE grid
  vec_se <- c(se_MSE)
  upper <- vec_PRESS + vec_se
  lower <- vec_PRESS - vec_se


  if (length(LassoSequence)>=2 && length(GLassoSequence)>=2){ #### CASE1: multiple lasso and glasso

    lowestPress <- min(vec_PRESS)

    if(length(which(vec_PRESS == lowestPress))>1){
      #this happens when min(vec_PRESS) contains multiple values
      lowestplus1SE <- lowestPress + min(vec_se[which(vec_PRESS == lowestPress)])
    } else{
      lowestplus1SE <- lowestPress + vec_se[which(vec_PRESS == lowestPress)] #plot 1SE rule region
    }
    # For every Group Lasso value, find the pair of consecutive Lasso values
    # whose MSPE brackets the 1SE target.
    lasso1 <- array()
    lasso2 <- array()
      for(i in 1:length(GLassoSequence)){

        pressindex <- which(abs(PRESS[, i]-lowestplus1SE)==min(abs(PRESS[, i]-lowestplus1SE)))  #comment: it seems that we have to have an index number here, instead of inserting the index directly in PRESS[index, i]
        pressindex <- pressindex[length(pressindex)]  #this is because in case of large Glasso values, preindex is a vector, we choose the one with the most sparse results
        lasso1temp <- LassoSequence[pressindex]

        if(PRESS[pressindex, i] - lowestplus1SE > 0 ){
          if(LassoSequence[pressindex] == LassoSequence[1]){
            lasso2temp <- lasso1temp  #otherwise lasso2 is out of the boundary
          } else{
            lasso2temp <- LassoSequence[pressindex - 1]
            #the following condition concerns a rare case
            if(PRESS[pressindex - 1, i]  - lowestplus1SE > 0){
              lasso2temp <- lasso1temp
            }
          }

          lasso1[i] <- lasso2temp
          lasso2[i] <- lasso1temp
        }  else if (PRESS[pressindex, i] - lowestplus1SE < 0 ){
              if(LassoSequence[pressindex] == LassoSequence[length(LassoSequence)]){
                lasso2temp <- lasso1temp #otherwise lasso2 is out of the boundary
              } else{
                lasso2temp <- LassoSequence[pressindex + 1]
                #the following condition concerns a rare case
                if(PRESS[pressindex + 1, i]  - lowestplus1SE < 0) {
                  lasso2temp <- lasso1temp
                }
              }

            lasso1[i] <- lasso1temp
            lasso2[i] <- lasso2temp

        } else { #this is when a PRESS value lies exactly on the 1SE dotted line

          lasso1[i] <- lasso1temp
          lasso2[i] <- lasso1temp
        }
      }

    lambdaregion <- cbind(lasso1, lasso2)


  } else if(length(LassoSequence)>=2 && length(GLassoSequence)==1){ #### CASE2: multiple lasso, one glasso

    df <- data.frame(LassoI = LassoSequence, Press = vec_PRESS, Upper = upper, Lower = lower)

    lowestPress <- min(vec_PRESS)
    lowestplus1SE <- lowestPress + min(vec_se[vec_PRESS == lowestPress]) #plot 1SE rule region; min() guards against ties
    idx <- which(abs(vec_PRESS-lowestplus1SE)==min(abs(vec_PRESS-lowestplus1SE)))
    idx <- idx[length(idx)]  #in case of ties choose the most sparse result, as in CASE 1
    lasso1 <- df$LassoI[idx]
    if(vec_PRESS[idx] - lowestplus1SE > 0 ){
      if(df$LassoI[idx] == df$LassoI[1]){
        lasso2 <- lasso1 #otherwise lasso2 is out of the boundary
      } else{
        lasso2 <- df$LassoI[idx - 1]
      }
      #the following condition concerns a rare case (the boundary check must
      #come first: vec_PRESS[idx - 1] has length zero when idx == 1)
      if((df$LassoI[idx] != df$LassoI[1]) &&
          (vec_PRESS[idx - 1] - lowestplus1SE > 0)){
        lasso2 <- lasso1
        cat("\nWarning! The region for proper tuning parameter values is not available. A possible value is ", lasso1)
        lambdaregion <- lasso1
      } else{
        lambdaregion <- c(lasso2, lasso1)
      }
    } else if (vec_PRESS[idx] - lowestplus1SE < 0 ){
      if(df$LassoI[idx] == df$LassoI[length(LassoSequence)]){
        lasso2 <- lasso1 #otherwise lasso2 is out of the boundary
      } else{
        lasso2 <- df$LassoI[idx + 1]
      }

      #the following condition concerns a rare case (boundary check first,
      #see above)
      if((df$LassoI[idx] != df$LassoI[length(LassoSequence)]) &&
          (vec_PRESS[idx + 1] - lowestplus1SE < 0)){
        lasso2 <- lasso1
        cat("\nWarning! The region for proper tuning parameter values is not available. A possible value is ", lasso1)
        lambdaregion <- lasso1
      } else{
        lambdaregion <- c(lasso1, lasso2)
      }

    } else {#this is when a PRESS value lies exactly on the 1SE dotted line
      lasso2 <- lasso1
      lambdaregion <- lasso1
    }


  } else if(length(LassoSequence)==1 && length(GLassoSequence)>= 2){####CASE 3: Multiple glasso, one lasso

      df <- data.frame(GLassoI = GLassoSequence, Press = vec_PRESS, Upper = upper, Lower = lower)

      lowestPress <- min(vec_PRESS)
      lowestplus1SE <- lowestPress + min(vec_se[vec_PRESS == lowestPress]) #plot 1SE rule region; min() guards against ties
      idx <- which(abs(vec_PRESS-lowestplus1SE)==min(abs(vec_PRESS-lowestplus1SE)))
      idx <- idx[length(idx)]  #in case of ties choose the most sparse result, as in CASE 1
      glasso1 <- df$GLassoI[idx]
      if(vec_PRESS[idx] - lowestplus1SE > 0 ){
        if(df$GLassoI[idx] == df$GLassoI[1]){
          glasso2 <- glasso1  #otherwise Glasso2 is out of the boundary
        } else{
          glasso2 <- df$GLassoI[idx - 1]
        }
        #the following condition concerns a rare case (the boundary check must
        #come first: vec_PRESS[idx - 1] has length zero when idx == 1)
        if((df$GLassoI[idx] != df$GLassoI[1]) &&
            (vec_PRESS[idx - 1] - lowestplus1SE > 0)){
          glasso2 <- glasso1
          cat("\nWarning! The region for proper tuning parameter values is not available. A possible value is ", glasso1)
          lambdaregion <- glasso1
        } else{
          lambdaregion <- c(glasso2, glasso1)
        }

      } else if (vec_PRESS[idx] - lowestplus1SE < 0 ){
        if(df$GLassoI[idx] == df$GLassoI[length(GLassoSequence)]){
          glasso2 <- glasso1 #otherwise glasso2 is out of the boundary
        } else{
          glasso2 <- df$GLassoI[idx + 1]
        }

        #the following condition concerns a rare case (boundary check first,
        #see above)
        if((df$GLassoI[idx] != df$GLassoI[length(GLassoSequence)]) &&
            (vec_PRESS[idx + 1] - lowestplus1SE < 0)){
          glasso2 <- glasso1
          cat("\nWarning! The region for proper tuning parameter values is not available. A possible value is ", glasso1)
          lambdaregion <- glasso1
        } else{
          lambdaregion <- c(glasso1, glasso2)
        }

      } else { #this is when a PRESS value lies exactly on the 1SE dotted line
        glasso2 <- glasso1
        lambdaregion <- glasso1
      }

  } else { #### CASE 4: a single Lasso and a single Group Lasso value
    # No region to search over; lowestplus1SE must still be defined for the
    # code below (previously this case stopped with "object not found").
    lowestPress <- min(vec_PRESS)
    lowestplus1SE <- lowestPress + min(vec_se[vec_PRESS == lowestPress])
    lambdaregion <- c(LassoSequence, GLassoSequence)
  }

  # Recommended tuning pair(s): MSPE closest to the 1SE target. With
  # arr.ind = TRUE, which() returns a (row, col) index matrix: column 1
  # indexes LassoSequence (rows of PRESS), column 2 indexes GLassoSequence.
  # (Indexing with indexTuning[1]/indexTuning[2] would be wrong whenever
  # more than one pair ties.)
  indexTuning <- which(abs(PRESS - lowestplus1SE)==min(abs(PRESS - lowestplus1SE)), arr.ind = TRUE)
  bestTunning <- matrix(NA, dim(indexTuning)[1], 2)
  bestTunning[, 1] <- LassoSequence[indexTuning[, 1]]
  bestTunning[, 2] <- GLassoSequence[indexTuning[, 2]]

  colnames(bestTunning) <- c("Lasso", "Group Lasso")

  if(length(LassoSequence)>=2 && length(GLassoSequence)>=2){
    colnames(lambdaregion) <- c("lower bound", "upper bound")
  }



  ###### re-estimate the model with the recommended tuning parameters
  if(dim(bestTunning)[1] == 1){
    Re_est <- sparseSCA(DATA, Jk, R, LASSO = bestTunning[1, 1], GROUPLASSO = bestTunning[1, 2], MaxIter, NRSTARTS = 20, method)
      p_hat <- Re_est$Pmatrix
      t_hat <- Re_est$Tmatrix

  }else if(dim(bestTunning)[1] > 1){
    # in this case more than one pair of tuning parameters are recommended (although it's highly unlikely)
    p_hat <- list()
    t_hat <- list()

      for(j in 1:dim(bestTunning)[1]){
        Re_est <- sparseSCA(DATA, Jk, R, LASSO = bestTunning[j, 1], GROUPLASSO = bestTunning[j, 2], MaxIter, NRSTARTS = 20, method)

        p_hat[[j]] <- Re_est$Pmatrix
        t_hat[[j]] <- Re_est$Tmatrix
      }
  }

  # Assemble the S3 return object (class "CVsparseSCA"); see the roxygen
  # @return section for the meaning of each element.
  return_crossvali <- list()
  return_crossvali$MSPE <- PRESS
  return_crossvali$SE_MSE <- se_MSE
  return_crossvali$MSPE1SE <- lowestplus1SE
  return_crossvali$VarSelected <- varselected
  return_crossvali$Lasso_values <- LassoSequence
  return_crossvali$Glasso_values <- GLassoSequence
  #return_crossvali$Lambdaregion <- lambdaregion  # lambdaregion will be removed in the next major update
  return_crossvali$RecommendedLambda <- bestTunning
  return_crossvali$P_hat <- p_hat
  return_crossvali$T_hat <- t_hat
  return_crossvali$plotlog <- plotlog
  attr(return_crossvali, "class") <- "CVsparseSCA"
  return(return_crossvali)

}
# ZhengguoGu/RegularizedSCA documentation built on July 4, 2019, 2:46 p.m.