R/grouplasso.R

Defines functions grouplasso_gt_cv_adapt grouplasso_gt_cv_fixedgrid grouplasso_gt_grid grouplasso_gt grouplasso_logreg_cv_adapt grouplasso_logreg_cv_fixedgrid grouplasso_logreg_grid grouplasso_linreg_cv_adapt grouplasso_linreg_cv_fixedgrid grouplasso_linreg_grid

Documented in grouplasso_gt grouplasso_gt_cv_adapt grouplasso_gt_cv_fixedgrid grouplasso_gt_grid grouplasso_linreg_cv_adapt grouplasso_linreg_cv_fixedgrid grouplasso_linreg_grid grouplasso_logreg_cv_adapt grouplasso_logreg_cv_fixedgrid grouplasso_logreg_grid

#' Fit group lasso regression estimator for continuous resopnse over a grid of lambda values
#'
#' @param Y the response vector
#' @param X matrix containing the design matrices
#' @param groups a vector indicating to which group each covariate belongs
#' @param n.lambda the number of lambda values desired
#' @param lambda.min.ratio ratio of the smallest lambda value to the smallest value of lambda which admits no variables to the model
#' @param lambda.max.ratio ratio of the largest lambda value to the smallest value of lambda which admits no variables to the model
#' @param w group-specific weights for different penalization of different groups
#' @param tol a convergence criterion
#' @param maxiter the maximum allowed number of iterations
#' @param report.prog a logical indicating whether the progress of the algorithm should be printed to the console
#' @return a list containing the fits over a grid of lambda values as well as the vector of lambda values
#' @examples
#' grouplasso_linreg_data <- get_grouplasso_data(n = 400, response = "continuous")
#' 
#' grouplasso_linreg_grid.out <- grouplasso_linreg_grid(Y = grouplasso_linreg_data$Y,
#'                                                      X = grouplasso_linreg_data$X,
#'                                                      groups = grouplasso_linreg_data$groups,
#'                                                      n.lambda = 25,
#'                                                      lambda.min.ratio = 0.001,
#'                                                      lambda.max.ratio = 0.1,
#'                                                      w = grouplasso_linreg_data$w,
#'                                                      tol = 1e-3,
#'                                                      maxiter = 500,
#'                                                      report.prog = TRUE)
#' @export
grouplasso_linreg_grid <- function(Y,X,groups,n.lambda,lambda.min.ratio,lambda.max.ratio=1,w,tol=1e-4,maxiter=500,report.prog=FALSE)
{

  # find lambda.max
  q <- length(unique(groups))
  n <- nrow(X)

  norms <- numeric(q)
  for(j in 2:q)
  {

    ind <- which(groups == j)

    if(j <= q){

      norms[j] <- sqrt(sum((t(X[,ind]) %*% (Y - mean(Y)))^2)) / w[j]

    }

  }

  lambda.max <- 2 * max(norms) # yes this is correct! This is the smallest value of lambda which sets all the non-intercept entries of beta equal to zero.

  # make a lambda sequence
  largest.lambda <- lambda.max.ratio * lambda.max
  smallest.lambda <- lambda.min.ratio * lambda.max
  lambda.seq <- sort(c(exp(log(smallest.lambda) + ((n.lambda):1)/(n.lambda) * ((log(largest.lambda) - log(smallest.lambda))))))
  
  if(n.lambda == 1) lambda.seq <- lambda.min

  # fit over the values in the lambda sequence using warm starts
  b.mat <- matrix(0,ncol(X),n.lambda)
  iterations <- matrix(0,n.lambda,2)
  colnames(iterations) <- c("lambda","iter")
  step <- 0
  init <- rep(0,ncol(X))
  for(l in 1:n.lambda){

      grouplasso_linreg.out <- grouplasso_linreg(rY = Y,
                                                 rX = X,
                                                 groups = groups,
                                                 lambda = lambda.seq[l],
                                                 w = w,
                                                 tol = tol,
                                                 maxiter = maxiter,
                                                 beta_init = init)

      b <- grouplasso_linreg.out$beta.hat

      init <- b
      b.mat[,l] <- b

      step <- step + 1
      iterations[step,] <- c(lambda.seq[l],grouplasso_linreg.out$iter)

      if(report.prog == TRUE){

        print(c(l,grouplasso_linreg.out$iter))

      }

  }

  output <- list( b.mat = b.mat,
                  lambda.seq = lambda.seq,
                  iterations = iterations)

  return(output)

}

#' Choose tuning parameters by crossvalidation for grouplasso linreg when given a fixed grid of lambda values
#'
#' @param Y the response vector
#' @param X matrix containing the design matrices
#' @param groups a vector indicating to which group each covariate belongs
#' @param lambda.seq sequence of lambda values
#' @param n.folds the number of crossvalidation folds
#' @param b.init.arr array of initial values for beta
#' @param n.folds the number of crossvalidation folds
#' @param w group-specific weights for different penalization of different groups
#' @param tol the convergence tolerance
#' @param maxiter the maximum number of iterations allowed for each fit
#' @return a list containing the fits over a grid of lambda values as well as the vector of lambda values
#' @examples
#' grouplasso_linreg_data <- get_grouplasso_data(n = 400, response = "continuous")
#' 
#' grouplasso_linreg_grid.out <- grouplasso_linreg_grid(Y = grouplasso_linreg_data$Y,
#'                                                      X = grouplasso_linreg_data$X,
#'                                                      groups = grouplasso_linreg_data$groups,
#'                                                      n.lambda = 25,
#'                                                      lambda.min.ratio = 0.001,
#'                                                      lambda.max.ratio = 0.1,
#'                                                      w = grouplasso_linreg_data$w,
#'                                                      tol = 1e-3,
#'                                                      maxiter = 500,
#'                                                      report.prog = FALSE)
#' 
#' grouplasso_linreg_cv_fixedgrid.out <- grouplasso_linreg_cv_fixedgrid(Y = grouplasso_linreg_data$Y,
#'                                                                      X = grouplasso_linreg_data$X,
#'                                                                      groups = grouplasso_linreg_data$groups,
#'                                                                      lambda.seq = grouplasso_linreg_grid.out$lambda.seq,
#'                                                                      n.folds = 5,
#'                                                                      b.init.mat = grouplasso_linreg_grid.out$b.mat,
#'                                                                      w = grouplasso_linreg_data$w,
#'                                                                      tol = 1e-3,
#'                                                                      maxiter = 500)
#' @export
grouplasso_linreg_cv_fixedgrid <- function(Y,X,groups,lambda.seq,n.folds,b.init.mat,w,tol=1e-3,maxiter=500)
{

  # create list of sets of indices indicating which observations are in each fold
  n <- nrow(X)

  folds <- vector("list",n.folds)
  fold.size <- floor(n / n.folds)
  for(fold in 1:n.folds){

    folds[[fold]] <- ((fold-1)*fold.size + 1):(fold*fold.size)

  }

  if( floor(n / n.folds) != n / n.folds ){

    folds[[n.folds]] <- c(folds[[n.folds]],(fold*fold.size+1):n)

  }

  # get fits at all lambda values on all cv folds
  n.lambda <- length(lambda.seq)
  b.folds.arr <- array(0,dim=c(ncol(X),n.lambda,n.folds))

  minus2ll.mat <- matrix(0,n.lambda,n.folds)

  iterations <- matrix(0,n.lambda,1+n.folds)
  colnames(iterations) <- c("lambda",paste("fold",1:n.folds,"iter"))
  step <- 1

  for(l in 1:n.lambda){

      iterations[step,1] <- lambda.seq[l]

      for(fold in 1:n.folds){

        fold.ind <- folds[[fold]]

        grouplasso_linreg.out <- grouplasso_linreg(rY = Y[-fold.ind],
                                                   rX = X[-fold.ind,],
                                                   groups = groups,
                                                   lambda = lambda.seq[l]*(n.folds - 1)/n.folds,
                                                   w = w,
                                                   tol = tol,
                                                   maxiter = maxiter,
                                                   beta_init = b.init.mat[,l])

        b.fold <- grouplasso_linreg.out$beta.hat
        b.folds.arr[,l,fold] <- b.fold

        minus2ll.fold <- sum( (Y[fold.ind] - X[fold.ind,] %*% b.fold )^2 )
        minus2ll.mat[l,fold] <- minus2ll.fold

        iterations[step,1+fold] <- grouplasso_linreg.out$iter

      }

      step <- step + 1

  }

  meanCVll <- apply(minus2ll.mat,1,mean)
  seCVll <- apply(minus2ll.mat,1,sd)/sqrt(n.folds)
  which.lambda.cv <- which.min(meanCVll)

  plus1se.thresh <- meanCVll[which.lambda.cv] + seCVll[which.lambda.cv]
  which.lambda.cv.1se <- max(which(meanCVll <= plus1se.thresh))
  
  output <- list( b.folds.arr = b.folds.arr,
                  minus2ll.mat = minus2ll.mat,
                  which.lambda.cv = which.lambda.cv,
                  which.lambda.cv.1se = which.lambda.cv.1se,
                  lambda.seq = lambda.seq,
                  iterations = iterations)

  return(output)

}



#' Choose tuning parameters by crossvalidation for grouplasso linreg with adaptive weights
#'
#' @param Y the response vector
#' @param X matrix containing the design matrices
#' @param groups a vector indicating to which group each covariate belongs
#' @param n.lambda the number of lambda values desired
#' @param lambda.min.ratio ratio of the smallest lambda value to the smallest value of lambda which admits no variables to the model
#' @param lambda.max.ratio ratio of the largest lambda value to the smallest value of lambda which admits no variables to the model
#' @param n.folds the number of crossvalidation folds
#' @param w group-specific weights for different penalization toward similarity for different groups
#' @param tol a convergence criterion
#' @param maxiter the maximum allowed number of iterations
#' @param report.prog a logical indicating whether the progress of the algorithm should be printed to the console
#' @return a list containing the fits over a grid of lambda values as well as the vector of lambda values
#'
#' @examples
#' grouplasso_linreg_data <- get_grouplasso_data(n = 400, response = "continuous")
#' 
#' grouplasso_linreg_cv_adapt.out <- grouplasso_linreg_cv_adapt(Y = grouplasso_linreg_data$Y,
#'                                                              X = grouplasso_linreg_data$X,
#'                                                              groups = grouplasso_linreg_data$groups,
#'                                                              n.lambda = 25,
#'                                                              lambda.min.ratio = 0.001,
#'                                                              lambda.max.ratio = 0.1,
#'                                                              n.folds = 5,
#'                                                              w = grouplasso_linreg_data$w,
#'                                                              tol = 1e-3,
#'                                                              maxiter = 500,
#'                                                              report.prog = FALSE)
#' @export
grouplasso_linreg_cv_adapt <- function(Y,X,groups,n.lambda,lambda.min.ratio,lambda.max.ratio=1,n.folds,w,tol=1e-3,maxiter=500,report.prog = TRUE){


  # find lambda.max
  q <- length(unique(groups))
  n <- nrow(X)

  norms <- numeric(q)
  for(j in 2:q){

    ind <- which(groups == j)

    if(j <= q){

      norms[j] <- sqrt(sum((t(X[,ind]) %*% (Y - mean(Y)))^2)) / w[j]

    }

  }

  lambda.max <- 2 * max(norms) # yes this is correct! This is the smallest value of lambda which sets all the non-intercept entries of beta equal to zero.
  lambda.initial.fit <- lambda.min.ratio * lambda.max

  # fit a grouplasso with lambda as lambda.min.ratio * lambda.max.
  grouplasso_linreg.out <- grouplasso_linreg(rY = Y,
                                             rX = X,
                                             groups = groups,
                                             lambda = lambda.initial.fit,
                                             w = w,
                                             tol = tol,
                                             maxiter = maxiter)


  # now make a new value of w based on this initial fit
  for(j in 1:q){

    ind <- which(groups == j)
    w[j] <- min(w[j]/sqrt( sum( grouplasso_linreg.out$beta.hat[ind]^2 )),1e10) # replace Inf with 1e10

  }

  # obtain lambda.seq from the grid function, as well as the fits on the entire data set, which will be used as initial values for the crossvalidation training fits.
  grouplasso_linreg_grid.out <- grouplasso_linreg_grid(Y = Y,
                                                       X = X,
                                                       groups = groups,
                                                       n.lambda = n.lambda,
                                                       lambda.min.ratio = lambda.min.ratio,
                                                       lambda.max.ratio = lambda.max.ratio,
                                                       w = w,
                                                       tol = tol,
                                                       maxiter = maxiter,
                                                       report.prog = report.prog)

  lambda.seq <- grouplasso_linreg_grid.out$lambda.seq
  b.mat <- grouplasso_linreg_grid.out$b.mat

  # do the crossvalidation
  grouplasso_linreg_cv_fixedgrid.out <- grouplasso_linreg_cv_fixedgrid(Y = Y,
                                                                       X = X,
                                                                       groups = groups,
                                                                       lambda.seq = lambda.seq,
                                                                       n.folds = n.folds,
                                                                       b.init.mat = b.mat,
                                                                       w = w,
                                                                       tol = tol,
                                                                       maxiter = maxiter)

  output <- list( b.mat = b.mat,
                  b.folds.arr = grouplasso_linreg_cv_fixedgrid.out$b.folds.arr,
                  minus2ll.mat = grouplasso_linreg_cv_fixedgrid.out$minus2ll.mat,
                  which.lambda.cv = grouplasso_linreg_cv_fixedgrid.out$which.lambda.cv,
                  which.lambda.cv.1se = grouplasso_linreg_cv_fixedgrid.out$which.lambda.cv.1se,###THIS
                  lambda.seq = lambda.seq,
                  lambda.initial.fit = lambda.initial.fit,
                  w = w,
                  iterations = grouplasso_linreg_cv_fixedgrid.out$iterations)

  return(output)

}





#' Fit grouplasso logistic regression estimator over a grid of lambda values
#'
#' @param Y the binary response vector
#' @param X matrix containing the design matrices
#' @param groups a vector indicating to which group each covariate belongs
#' @param n.lambda the number of lambda values desired
#' @param lambda.min.ratio ratio of the smallest lambda value to the smallest value of lambda which admits no variables to the model
#' @param lambda.max.ratio ratio of the largest lambda value to the smallest value of lambda which admits no variables to the model
#' @param w group-specific weights for different penalization of different groups
#' @param tol a convergence criterion
#' @param maxiter the maximum allowed number of iterations
#' @param report.prog a logical indicating whether the progress of the algorithm should be printed to the console
#' @return a list containing the fits over a grid of lambda values as well as the vector of lambda values
#' @examples 
#' grouplasso_logreg_data <- get_grouplasso_data(n = 400, response = "binary")
#' 
#' grouplasso_logreg_grid.out <- grouplasso_logreg_grid(Y = grouplasso_logreg_data$Y,
#'                                                      X = grouplasso_logreg_data$X,
#'                                                      groups = grouplasso_logreg_data$groups,
#'                                                      n.lambda = 25,
#'                                                      lambda.min.ratio = 0.001,
#'                                                      w = grouplasso_logreg_data$w,
#'                                                      tol = 1e-3,
#'                                                      maxiter = 500,
#'                                                      report.prog = TRUE)
#' @export
grouplasso_logreg_grid <- function(Y,X,groups,n.lambda,lambda.min.ratio,lambda.max.ratio=1,w,tol=1e-4,maxiter=500,report.prog=FALSE)
{
  
  # find lambda.max
  q <- length(unique(groups))
  n <- nrow(X)
  
  norms <- numeric(q)
  for(j in 2:q)
  {
    
    ind <- which(groups == j)
    
    if(j <= q){
      
      norms[j] <- sqrt(sum((t(X[,ind]) %*% (Y - mean(Y)))^2)) / w[j]
      
    }
    
  }
  
  lambda.max <- 2 * max(norms) # yes this is correct! This is the smallest value of lambda which sets all the non-intercept entries of beta equal to zero.
  
  # make a lambda sequence
  largest.lambda <- lambda.max.ratio * lambda.max
  smallest.lambda <- lambda.min.ratio * lambda.max
  lambda.seq <- sort(c(exp(log(smallest.lambda) + ((n.lambda):1)/(n.lambda) * ((log(largest.lambda) - log(smallest.lambda))))))
  
  if(n.lambda == 1) lambda.seq <- lambda.min
  
  # fit over the values in the lambda sequence using warm starts
  b.mat <- matrix(0,ncol(X),n.lambda)
  iterations <- matrix(0,n.lambda,2)
  colnames(iterations) <- c("lambda","iter")
  step <- 0
  init <- rep(0,ncol(X))
  for(l in 1:n.lambda){
    
    grouplasso_logreg.out <- grouplasso_logreg(rY = Y,
                                               rX = X,
                                               groups = groups,
                                               lambda = lambda.seq[l],
                                               w = w,
                                               tol = tol,
                                               maxiter = maxiter,
                                               beta_init = init)
    
    b <- grouplasso_logreg.out$beta.hat
    
    init <- b
    b.mat[,l] <- b
    
    step <- step + 1
    iterations[step,] <- c(lambda.seq[l],grouplasso_logreg.out$iter)
    
    if(report.prog == TRUE){
      
      print(c(l,grouplasso_logreg.out$iter))
      
    }
    
  }
  
  output <- list( b.mat = b.mat,
                  lambda.seq = lambda.seq,
                  iterations = iterations)
  
  return(output)
  
}


#' Choose tuning parameters by crossvalidation for grouplasso logreg when given a fixed grid of lambda values
#'
#' @param Y the binary response vector
#' @param X matrix containing the design matrices
#' @param groups a vector indicating to which group each covariate belongs
#' @param lambda.seq sequence of lambda values
#' @param n.folds the number of crossvalidation folds
#' @param b.init.arr array of initial values for beta
#' @param n.folds the number of crossvalidation folds
#' @param w group-specific weights for different penalization of different groups
#' @param tol the convergence tolerance
#' @param maxiter the maximum number of iterations allowed for each fit
#' @return a list containing the fits over a grid of lambda values as well as the vector of lambda values
#' @examples
#' grouplasso_logreg_data <- get_grouplasso_data(n = 100, response = "binary")
#' 
#' grouplasso_logreg_grid.out <- grouplasso_logreg_grid(Y = grouplasso_logreg_data$Y,
#'                                                      X = grouplasso_logreg_data$X,
#'                                                      groups = grouplasso_logreg_data$groups,
#'                                                      n.lambda = 25,
#'                                                      lambda.min.ratio = 0.01,
#'                                                      w = grouplasso_logreg_data$w,
#'                                                      tol = 1e-3,
#'                                                      maxiter = 500,
#'                                                      report.prog = FALSE)
#' 
#' grouplasso_logreg_cv_fixedgrid.out <- grouplasso_logreg_cv_fixedgrid(Y = grouplasso_logreg_data$Y,
#'                                                                      X = grouplasso_logreg_data$X,
#'                                                                      groups = grouplasso_logreg_data$groups,
#'                                                                      lambda.seq = grouplasso_logreg_grid.out$lambda.seq,
#'                                                                      n.folds = 5,
#'                                                                      b.init.mat = grouplasso_logreg_grid.out$b.mat,
#'                                                                      w = grouplasso_logreg_data$w,
#'                                                                      tol = 1e-3,
#'                                                                      maxiter = 500)
#' @export
grouplasso_logreg_cv_fixedgrid <- function(Y,X,groups,lambda.seq,n.folds,b.init.mat,w,tol=1e-3,maxiter=500)
{
  
  # create list of sets of indices indicating which observations are in each fold
  n <- nrow(X)
  
  folds <- vector("list",n.folds)
  fold.size <- floor(n / n.folds)
  for(fold in 1:n.folds){
    
    folds[[fold]] <- ((fold-1)*fold.size + 1):(fold*fold.size)
    
  }
  
  if( floor(n / n.folds) != n / n.folds ){
    
    folds[[n.folds]] <- c(folds[[n.folds]],(fold*fold.size+1):n)
    
  }
  
  # get fits at all lambda values on all cv folds
  n.lambda <- length(lambda.seq)
  b.folds.arr <- array(0,dim=c(ncol(X),n.lambda,n.folds))
  
  minus2ll.mat <- matrix(0,n.lambda,n.folds)
  
  iterations <- matrix(0,n.lambda,1+n.folds)
  colnames(iterations) <- c("lambda",paste("fold",1:n.folds,"iter"))
  step <- 1
  
  for(l in 1:n.lambda){
    
    iterations[step,1] <- lambda.seq[l]
    
    for(fold in 1:n.folds){
      
      fold.ind <- folds[[fold]]
      
      grouplasso_logreg.out <- grouplasso_logreg(rY = Y[-fold.ind],
                                                 rX = X[-fold.ind,],
                                                 groups = groups,
                                                 lambda = lambda.seq[l]*(n.folds - 1)/n.folds,
                                                 w = w,
                                                 tol = tol,
                                                 maxiter = maxiter,
                                                 beta_init = b.init.mat[,l])
      
      b.fold <- grouplasso_logreg.out$beta.hat
      b.folds.arr[,l,fold] <- b.fold
      
      P.fold <- logit(X[fold.ind,] %*% b.fold)
      minus2ll.fold <- - 2 * sum( Y[fold.ind] * log(P.fold) + (1-Y[fold.ind]) * log( 1 - P.fold) )
      minus2ll.mat[l,fold] <- minus2ll.fold
      
      iterations[step,1+fold] <- grouplasso_logreg.out$iter
      
    }
    
    step <- step + 1
    
  }
  
  meanCVll <- apply(minus2ll.mat,1,mean)
  which.lambda.cv <- which.min(meanCVll)
  
  output <- list( b.folds.arr = b.folds.arr,
                  minus2ll.mat = minus2ll.mat,
                  which.lambda.cv = which.lambda.cv,
                  lambda.seq = lambda.seq,
                  iterations = iterations)
  
  return(output)
  
}



#' Choose tuning parameters by crossvalidation for grouplasso logreg with adaptive weights
#'
#' @param Y the binary response vector
#' @param X matrix containing the design matrices
#' @param groups a vector indicating to which group each covariate belongs
#' @param n.lambda the number of lambda values desired
#' @param lambda.min.ratio ratio of the smallest lambda value to the smallest value of lambda which admits no variables to the model
#' @param lambda.max.ratio ratio of the largest lambda value to the smallest value of lambda which admits no variables to the model
#' @param n.folds the number of crossvalidation folds
#' @param w group-specific weights for different penalization toward similarity for different groups
#' @param tol a convergence criterion
#' @param maxiter the maximum allowed number of iterations
#' @param report.prog a logical indicating whether the progress of the algorithm should be printed to the console
#' @return a list containing the fits over a grid of lambda values as well as the vector of lambda values
#' @examples
#' grouplasso_logreg_data <- get_grouplasso_data(n = 500,response = "binary")
#' 
#' grouplasso_logreg_cv_adapt.out <- grouplasso_logreg_cv_adapt(Y = grouplasso_logreg_data$Y,
#'                                                              X = grouplasso_logreg_data$X,
#'                                                              groups = grouplasso_logreg_data$groups,
#'                                                              n.lambda = 25,
#'                                                              lambda.min.ratio = 0.01,
#'                                                              lambda.max.ratio = 0.50,
#'                                                              n.folds = 5,
#'                                                              w = grouplasso_logreg_data$w,
#'                                                              tol = 1e-3,
#'                                                              maxiter = 500,
#'                                                              report.prog = FALSE)
#' @export
grouplasso_logreg_cv_adapt <- function(Y,X,groups,n.lambda,lambda.min.ratio,lambda.max.ratio,n.folds,w,tol=1e-3,maxiter=500,report.prog = TRUE){
  
  
  # find lambda.max
  q <- length(unique(groups))
  n <- nrow(X)
  
  norms <- numeric(q)
  for(j in 2:q){
    
    ind <- which(groups == j)
    
    if(j <= q){
      
      norms[j] <- sqrt(sum((t(X[,ind]) %*% (Y - mean(Y)))^2)) / w[j]
      
    }
    
  }
  
  # smallest value of lambda which sets all the non-intercept entries of beta1 and beta2 equal to zero.
  lambda.max <- 2 * max(norms)
  lambda.initial.fit <- lambda.min.ratio * lambda.max
  
  # fit a grouplasso with lambda as lambda.min.ratio * lambda.max.
  grouplasso_logreg.out <- grouplasso_logreg(rY = Y,
                                             rX = X,
                                             groups = groups,
                                             lambda = lambda.initial.fit,
                                             w = w,
                                             tol = tol,
                                             maxiter = maxiter)
  
  
  # now make a new value of w based on this initial fit
  for(j in 1:q){
    
    ind <- which(groups == j)
    w[j] <- min(w[j]/sqrt( sum( grouplasso_logreg.out$beta.hat[ind]^2 )),1e10) # replace Inf with 1e10
    
  }
  
  # obtain lambda.seq from the grid function, as well as the fits on the entire data set, which will be used as initial values for the crossvalidation training fits.
  grouplasso_logreg_grid.out <- grouplasso_logreg_grid(Y = Y,
                                                       X = X,
                                                       groups = groups,
                                                       n.lambda = n.lambda,
                                                       lambda.min.ratio = lambda.min.ratio,
                                                       lambda.max.ratio = lambda.max.ratio,
                                                       w = w,
                                                       tol = tol,
                                                       maxiter = maxiter,
                                                       report.prog = report.prog)
  
  lambda.seq <- grouplasso_logreg_grid.out$lambda.seq
  b.mat <- grouplasso_logreg_grid.out$b.mat
  
  # do the crossvalidation
  grouplasso_logreg_cv_fixedgrid.out <- grouplasso_logreg_cv_fixedgrid(Y = Y,
                                                                       X = X,
                                                                       groups = groups,
                                                                       lambda.seq = lambda.seq,
                                                                       n.folds = n.folds,
                                                                       b.init.mat = b.mat,
                                                                       w = w,
                                                                       tol = tol,
                                                                       maxiter = maxiter)
  
  output <- list( b.mat = b.mat,
                  b.folds.arr = grouplasso_logreg_cv_fixedgrid.out$b.folds.arr,
                  minus2ll.mat = grouplasso_logreg_cv_fixedgrid.out$minus2ll.mat,
                  which.lambda.cv = grouplasso_logreg_cv_fixedgrid.out$which.lambda.cv,
                  lambda.seq = lambda.seq,
                  lambda.initial.fit = lambda.initial.fit,
                  w = w,
                  iterations = grouplasso_logreg_cv_fixedgrid.out$iterations)
  
  return(output)
  
}



#' Compute group lasso for two populations with group testing data
#'
#' @param Y Group testing output in the format as output by one of the functions \code{individual.assay.gen}, \code{masterpool.assay.gen}, \code{dorfman.assay.gen}, or \code{array.assay.gen}.
#' @param Z Group testing output in the format as output by one of the functions \code{individual.assay.gen}, \code{masterpool.assay.gen}, \code{dorfman.assay.gen}, or \code{array.assay.gen}.
#' @param Se A vector of testing sensitivities, where the first element is the
#'      testing specificity for pools and the second entry is the
#'      test specificity for individual testing, if applicable.
#' @param Sp A vector of testing specificities, where the first element is the
#'      testing specificity for pools and the second entry is the
#'      test specificity for individual testing, if applicable.
#' @param X the matrix with the observed covariate values (including a column of ones for the intercept)
#' @param E.approx a logical indicating whether the conditional expectations in the E-step should be computed approximately or exactly.
#' @param groups a vector indicating to which group each covariate belongs
#' @param lambda the level of sparsity penalization
#' @param w group-specific weights for different penalization of different groups
#' @param tol a convergence criterion
#' @param maxiter the maximum allowed number of iterations (EM steps)
#' @param init a list of initial values for the coefficient 
#' @param report.prog a logical. If \code{TRUE} then the number of inner loops required to complete the M step of the EM algorithm are returned after each EM step.
#' @return Returns the estimator of the semiparametric additive model with group testing data
#' @examples 
#' grouplasso_gt_data <- get_grouplasso_data(n = 1000, response = "gt")
#' 
#' grouplasso_gt.out <- grouplasso_gt(Y = grouplasso_gt_data$Y$I,
#'                                    Z = grouplasso_gt_data$Y$A,
#'                                    Se = grouplasso_gt_data$Y$Se,
#'                                    Sp = grouplasso_gt_data$Y$Sp,
#'                                    E.approx = grouplasso_gt_data$Y$E.approx,
#'                                    X = grouplasso_gt_data$X,
#'                                    groups = grouplasso_gt_data$groups,
#'                                    lambda = 1,
#'                                    w  = grouplasso_gt_data$w,
#'                                    tol = 1e-3,
#'                                    maxiter = 500,
#'                                    report.prog = TRUE)
#' @export
grouplasso_gt <- function(Y,Z,Se,Sp,E.approx,X,groups,lambda,w,tol=1e-3,maxiter=1000,init=NULL,report.prog=FALSE)
{
  
  # set function to get conditional expectations
  get.EY <- eval(parse(text = ifelse(E.approx, "EYapprox","EYexact")))
  
  # set initial values
  if(length(init) == 0){
    
    beta.hat1 <- rep(0,ncol(X))
    
  } else{
    
    beta.hat1 <- init
    
  }
  
  ###### Do the EM-algorithm with penalized updates
  conv <- 1
  iter <- 0
  inner.iter <- numeric()
  while( conv > tol & iter < maxiter)
  {
    
    beta.hat0 <- beta.hat1
    
    
    # E-step: compute the conditional expectations for the true disease statuses
    EY <- as.numeric(get.EY(Z,Y,X=X,b=beta.hat1,Se,Sp))
    
    # update initial values
    init <- beta.hat1
    
    # M-step: maximize the objective function with conditional expectations substituted
    grouplasso_logreg.out <- grouplasso_logreg(rY = EY,
                                               rX = X,
                                               groups = groups,
                                               lambda = lambda,
                                               w = w,
                                               tol = tol,
                                               maxiter = maxiter,
                                               beta_init = beta.hat1)
    
    beta.hat1 <- grouplasso_logreg.out$beta.hat
    
    conv <- max(abs(beta.hat1 - beta.hat0))
    iter <- iter + 1
    inner.iter[iter] <- grouplasso_logreg.out$iter
    
    if(report.prog) print(grouplasso_logreg.out$iter)
    
  }
  
  output <- list( beta.hat = beta.hat1,
                  inner.iter = inner.iter)
  
}


#' Fit grouplasso logistic regression estimator with group testing data over a grid of lambda values
#'
#' @param Y Group testing output in the format as output by one of the functions \code{individual.assay.gen}, \code{masterpool.assay.gen}, \code{dorfman.assay.gen}, or \code{array.assay.gen}.
#' @param Z Group testing output in the format as output by one of the functions \code{individual.assay.gen}, \code{masterpool.assay.gen}, \code{dorfman.assay.gen}, or \code{array.assay.gen}.
#' @param Se A vector of testing sensitivities, where the first element is the
#'      testing specificity for pools and the second entry is the
#'      test specificity for individual testing, if applicable.
#' @param Sp A vector of testing specificities, where the first element is the
#'      testing specificity for pools and the second entry is the
#'      test specificity for individual testing, if applicable.
#' @param X the matrix with the observed covariate values (including a column of ones for the intercept)
#' @param E.approx a logical indicating whether the conditional expectations in the E-step should be computed approximately or exactly.
#' @param groups a vector indicating to which group each covariate belongs
#' @param n.lambda the number of lambda values desired
#' @param lambda.min.ratio ratio of the smallest lambda value to the smallest value of lambda which admits no variables to the model
#' @param lambda.max.ratio ratio of the largest lambda value to the smallest value of lambda which admits no variables to the model
#' @param w group-specific weights for different penalization of different groups
#' @param tol a convergence criterion
#' @param maxiter the maximum allowed number of iterations (EM steps)
#' @param report.prog a logical. If \code{TRUE} then the number of inner loops required to complete the M step of the EM algorithm are returned after each EM step.
#' @return a list containing the fits over a grid of lambda values as well as the vector of lambda values
#' @examples
#' grouplasso_gt_data <- get_grouplasso_data(n = 1000, response = "gt")
#' 
#' grouplasso_gt_grid.out <- grouplasso_gt_grid(Y = grouplasso_gt_data$Y$I,
#'                                              Z = grouplasso_gt_data$Y$A,
#'                                              Se = grouplasso_gt_data$Y$Se,
#'                                              Sp = grouplasso_gt_data$Y$Sp,
#'                                              X = grouplasso_gt_data$X,
#'                                              E.approx = grouplasso_gt_data$Y$E.approx,
#'                                              groups = grouplasso_gt_data$groups,
#'                                              n.lambda = 10,
#'                                              lambda.min.ratio = 0.01,
#'                                              lambda.max.ratio = 0.50,
#'                                              w  = grouplasso_gt_data$w,
#'                                              tol = 1e-3,
#'                                              maxiter = 500,
#'                                              report.prog = TRUE)
#' @export
grouplasso_gt_grid <- function(Y,Z,Se,Sp,X,E.approx = FALSE,groups,n.lambda,lambda.min.ratio,lambda.max.ratio=1,w,tol=1e-4,maxiter=500,report.prog=FALSE)
{
  # get diagnoses; determine lambda sequence using these.
  Y.diag <- pull.diagnoses(Z,Y)
  
  # find lambda.max
  q <- length(unique(groups))
  n <- nrow(X)
  
  norms <- numeric(q)
  for(j in 2:q)
  {
    
    ind <- which(groups == j)
    
    if(j <= q){
      
      norms[j] <- sqrt(sum((t(X[,ind]) %*% (Y.diag - mean(Y.diag)))^2)) / w[j]
      
    }
    
  }
  
  lambda.max <- 2 * max(norms) # yes this is correct! This is the smallest value of lambda which sets all the non-intercept entries of beta equal to zero.
  
  # make a lambda sequence
  largest.lambda <- lambda.max.ratio * lambda.max
  smallest.lambda <- lambda.min.ratio * lambda.max
  lambda.seq <- sort(c(exp(log(smallest.lambda) + ((n.lambda):1)/(n.lambda) * ((log(largest.lambda) - log(smallest.lambda))))))
  
  if(n.lambda == 1) lambda.seq <- lambda.min
  
  # fit over the values in the lambda sequence using warm starts
  b.mat <- matrix(0,ncol(X),n.lambda)
  iterations <- matrix(0,n.lambda,2)
  colnames(iterations) <- c("lambda","iter")
  step <- 0
  init <- NULL
  for(l in 1:n.lambda){
    
    grouplasso_logreg.out <- grouplasso_gt(Y = Y,
                                           Z = Z,
                                           Se = Se,
                                           Sp =Sp,
                                           X = X,
                                           groups = groups,
                                           lambda = lambda.seq[l],
                                           w = w,
                                           E.approx = E.approx,
                                           tol = tol,
                                           maxiter = maxiter,
                                           init = init)
    
    b <- grouplasso_logreg.out$beta.hat
    
    init <- b
    b.mat[,l] <- b
    
    step <- step + 1
    iterations[step,] <- c(lambda.seq[l],grouplasso_logreg.out$iter)
    
    if(report.prog == TRUE){
      
      print(c(l,grouplasso_logreg.out$iter))
      
    }
    
  }
  
  output <- list( b.mat = b.mat,
                  lambda.seq = lambda.seq,
                  iterations = iterations)
  
  return(output)
  
}



#' Choose tuning parameters by crossvalidation for grouplasso logreg when given a fixed grid of lambda values
#'
#' @param Y Group testing output in the format as output by one of the functions \code{individual.assay.gen}, \code{dorfman.assay.gen}, or \code{array.assay.gen}.
#' @param Z Group testing output in the format as output by one of the functions \code{individual.assay.gen}, \code{dorfman.assay.gen}, or \code{array.assay.gen}.
#' @param Se A vector of testing sensitivities, where the first element is the
#'      testing specificity for pools and the second entry is the
#'      test specificity for individual testing, if applicable.
#' @param Sp A vector of testing specificities, where the first element is the
#'      testing specificity for pools and the second entry is the
#'      test specificity for individual testing, if applicable.
#' @param X matrix containing the design matrices
#' @param groups a vector indicating to which group each covariate belongs
#' @param lambda.seq sequence of lambda values
#' @param n.folds the number of crossvalidation folds
#' @param b.init.mat matrix of which the columns contain initial values for beta for the values of the tuning parameter in \code{lambda.seq}
#' @param n.folds the number of crossvalidation folds
#' @param w group-specific weights for different penalization of different groups
#' @param tol the convergence tolerance
#' @param maxiter the maximum number of iterations allowed for each fit
#' @return a list containing the fits over a grid of lambda values as well as the vector of lambda values
#' 
#' Note that the crossvalidation is carried out using the individual diagnoses as though they were the true individual disease statuses.
#' 
#' @examples
#' grouplasso_gt_data <- get_grouplasso_data(n = 1000, response = "gt")
#' 
#' grouplasso_gt_grid.out <- grouplasso_gt_grid(Y = grouplasso_gt_data$Y$I,
#'                                              Z = grouplasso_gt_data$Y$A,
#'                                              Se = grouplasso_gt_data$Y$Se,
#'                                              Sp = grouplasso_gt_data$Y$Sp,
#'                                              X = grouplasso_gt_data$X,
#'                                              E.approx = grouplasso_gt_data$Y$E.approx,
#'                                              groups = grouplasso_gt_data$groups,
#'                                              n.lambda = 10,
#'                                              lambda.min.ratio = 0.01,
#'                                              lambda.max.ratio = 0.50,
#'                                              w  = grouplasso_gt_data$w,
#'                                              tol = 1e-3,
#'                                              maxiter = 500,
#'                                              report.prog = TRUE)
#' 
#' grouplasso_gt_cv_fixedgrid.out <- grouplasso_gt_cv_fixedgrid(Y = grouplasso_gt_data$Y$I,
#'                                                              Z = grouplasso_gt_data$Y$A,
#'                                                              Se = grouplasso_gt_data$Y$Se,
#'                                                              Sp = grouplasso_gt_data$Y$Sp,
#'                                                              X = grouplasso_gt_data$X,
#'                                                              groups = grouplasso_gt_data$groups,
#'                                                              lambda.seq = grouplasso_gt_grid.out$lambda.seq,
#'                                                              n.folds = 5,
#'                                                              b.init.mat = grouplasso_gt_grid.out$b.mat,
#'                                                              w = grouplasso_gt_data$w,
#'                                                              tol = 1e-3,
#'                                                              maxiter = 500)
#' @export
grouplasso_gt_cv_fixedgrid <- function(Y,Z,Se,Sp,X,groups,lambda.seq,n.folds,b.init.mat,w,tol=1e-3,maxiter=500)
{
  
  # for now just use the diagnoses as the true responses and use the logreg fits on these
  Y.diag <- pull.diagnoses(Z,Y)
  
  # create list of sets of indices indicating which observations are in each fold
  n <- nrow(X)
  
  folds <- vector("list",n.folds)
  fold.size <- floor(n / n.folds)
  for(fold in 1:n.folds){
    
    folds[[fold]] <- ((fold-1)*fold.size + 1):(fold*fold.size)
    
  }
  
  if( floor(n / n.folds) != n / n.folds ){
    
    folds[[n.folds]] <- c(folds[[n.folds]],(fold*fold.size+1):n)
    
  }
  
  # get fits at all lambda values on all cv folds
  n.lambda <- length(lambda.seq)
  b.folds.arr <- array(0,dim=c(ncol(X),n.lambda,n.folds))
  
  minus2ll.mat <- matrix(0,n.lambda,n.folds)
  
  iterations <- matrix(0,n.lambda,1+n.folds)
  colnames(iterations) <- c("lambda",paste("fold",1:n.folds,"iter"))
  step <- 1
  
  for(l in 1:n.lambda){
    
    iterations[step,1] <- lambda.seq[l]
    
    for(fold in 1:n.folds){
      
      fold.ind <- folds[[fold]]
      
      grouplasso_logreg.out <- grouplasso_logreg(rY = Y.diag[-fold.ind],
                                                 rX = X[-fold.ind,],
                                                 groups = groups,
                                                 lambda = lambda.seq[l]*(n.folds - 1)/n.folds,
                                                 w = w,
                                                 tol = tol,
                                                 maxiter = maxiter,
                                                 beta_init = b.init.mat[,l])
      
      
      b.fold <- grouplasso_logreg.out$beta.hat
      b.folds.arr[,l,fold] <- b.fold
      
      P.fold <- logit(X[fold.ind,] %*% b.fold)
      minus2ll.fold <- - 2 * sum( Y.diag[fold.ind] * log(P.fold) + (1-Y.diag[fold.ind]) * log( 1 - P.fold) )
      minus2ll.mat[l,fold] <- minus2ll.fold
      
      iterations[step,1+fold] <- grouplasso_logreg.out$iter
      
    }
    
    step <- step + 1
    
  }
  
  meanCVll <- apply(minus2ll.mat,1,mean)
  which.lambda.cv <- which.min(meanCVll)
  
  output <- list( b.folds.arr = b.folds.arr,
                  minus2ll.mat = minus2ll.mat,
                  which.lambda.cv = which.lambda.cv,
                  lambda.seq = lambda.seq,
                  iterations = iterations)
  
  return(output)
  
}


#' Choose tuning parameters for the group lasso estimator with group testing data
#'
#' @param Y Group testing output in the format as output by one of the functions \code{individual.assay.gen}, \code{dorfman.assay.gen}, or \code{array.assay.gen}.
#' @param Z Group testing output in the format as output by one of the functions \code{individual.assay.gen}, \code{dorfman.assay.gen}, or \code{array.assay.gen}.
#' @param Se A vector of testing sensitivities, where the first element is the
#'      testing specificity for pools and the second entry is the
#'      test specificity for individual testing, if applicable.
#' @param Sp A vector of testing specificities, where the first element is the
#'      testing specificity for pools and the second entry is the
#'      test specificity for individual testing, if applicable.
#' @param X matrix containing the design matrices
#' @param E.approx a logical indicating whether the conditional expectations in the E-step should be computed approximately or exactly.
#' @param groups a vector indicating to which group each covariate belongs
#' @param n.lambda the number of lambda values
#' @param lambda.min.ratio ratio of the smallest lambda value to the smallest value of lambda which admits no variables to the model
#' @param lambda.max.ratio ratio of the largest lambda value to the smallest value of lambda which admits no variables to the model
#' @param n.folds the number of crossvalidation folds
#' @param w group-specific weights for different penalization toward similarity for different groups
#' @param tol a convergence criterion
#' @param maxiter the maximum allowed number of iterations (EM steps)
#' @param report.prog a logical. If \code{TRUE} then the number of inner loops required to complete the M step of the EM algorithm are returned after each EM step.
#' @return Returns the estimator of the parametric model with group testing data
#'
#' @examples
#' grouplasso_gt_data <- get_grouplasso_data(n = 1000, response = "gt")
#' 
#' grouplasso_gt_cv.out <- grouplasso_gt_cv_adapt(Y = grouplasso_gt_data$Y$I,
#'                                                Z = grouplasso_gt_data$Y$A,
#'                                                Se = grouplasso_gt_data$Y$Se,
#'                                                Sp = grouplasso_gt_data$Y$Sp,
#'                                                X = grouplasso_gt_data$X,
#'                                                E.approx = grouplasso_gt_data$Y$E.approx,
#'                                                groups = grouplasso_gt_data$groups,
#'                                                n.lambda = 10,
#'                                                lambda.min.ratio = 0.01,
#'                                                lambda.max.ratio = 0.50,
#'                                                n.folds = 5,
#'                                                w  = grouplasso_gt_data$w,
#'                                                tol = 1e-3,
#'                                                maxiter = 500,
#'                                                report.prog = TRUE)
#' @export
grouplasso_gt_cv_adapt <- function(Y,Z,Se,Sp,X,E.approx = FALSE,groups,n.lambda,n.eta,lambda.min.ratio,lambda.max.ratio,n.folds,w,tol=1e-3,maxiter=1000,report.prog=TRUE)
{
  
  # pull the individual diagnoses to be treated as true disease statuses to find the sequence of lambda values.
  Y.diag <- pull.diagnoses(Z,Y)
  
  # find lambda.max and lambda.min
  q <- length(unique(groups))
  n <- nrow(X)
  
  norms <- numeric(q)
  for(j in 2:q)
  {
    
    ind <- which(groups == j)
    norms[j] <- sqrt(sum((t(X[,ind]) %*% (Y.diag - mean(Y.diag)))^2)) / w[j] 
    
  }
  
  # smallest value of lambda which sets all the non-intercept entries of beta1 and beta2 equal to zero.
  lambda.max <- 2 * max(norms)
  lambda.initial.fit <- lambda.min.ratio * lambda.max
  
  # fit a grouplasso_gt with eta = 0 and lambda as lambda.min.ratio*lambda.max.
  grouplasso_gt.out <- grouplasso_gt(Y = Y,
                                     Z = Z,
                                     Se = Se,
                                     Sp = Sp,
                                     X = X,
                                     groups = groups,
                                     lambda = lambda.initial.fit,
                                     w = w,
                                     E.approx = E.approx,
                                     tol = tol,
                                     maxiter = maxiter,
                                     report.prog = FALSE)
  
  # now make new values of w based on these.
  for(j in 1:q){
    
    ind <- which(groups == j)
    w[j] <- min(w[j]/sqrt( sum( grouplasso_gt.out$beta.hat[ind]^2 )),1e10) #replace Inf with 1e10
    
  }
  
  # obtain lambda.seq and eta.seq from the grid function, as well as the fits on the entire data set, which will be used as initial values for the crossvalidation training fits.
  grouplasso_gt_grid.out <- grouplasso_gt_grid(Y = Y,
                                               Z = Z,
                                               Se = Se,
                                               Sp = Sp,
                                               X = X,
                                               groups = groups,
                                               n.lambda = n.lambda,
                                               lambda.min.ratio = lambda.min.ratio,
                                               lambda.max.ratio = lambda.max.ratio,
                                               w = w,
                                               E.approx = E.approx,
                                               tol = tol,
                                               maxiter = maxiter,
                                               report.prog = report.prog)
  
  # do the crossvalidation
  grouplasso_gt_cv_fixedgrid.out <- grouplasso_gt_cv_fixedgrid(Y = Y,
                                                               Z = Z,
                                                               Se = Se,
                                                               Sp = Sp,
                                                               X = X,
                                                               groups = groups,
                                                               lambda.seq = grouplasso_gt_grid.out$lambda.seq,
                                                               n.folds = n.folds,
                                                               b.init.mat = grouplasso_gt_grid.out$b.mat,
                                                               w = w,
                                                               tol = tol,
                                                               maxiter = maxiter)
  
  output <- list( b.mat = grouplasso_gt_grid.out$b.mat,
                  b.folds.arr = grouplasso_gt_cv_fixedgrid.out$b.folds.arr,
                  minus2ll.mat = grouplasso_gt_cv_fixedgrid.out$minus2ll.mat,
                  which.lambda.cv = grouplasso_gt_cv_fixedgrid.out$which.lambda.cv,
                  lambda.seq = grouplasso_gt_grid.out$lambda.seq,
                  lambda.initial.fit = lambda.initial.fit,
                  w = w,
                  iterations = grouplasso_gt_cv_fixedgrid.out$iterations)
  
  return(output)
  
}
gregorkb/semipadd2pop documentation built on Oct. 2, 2022, 1:37 p.m.