PGS: Penalized GEE with Grid Search

Documented in pgsfit

#' Penalized Generalized Estimation Equation with Grid Search
#' 
#' \code{pgsfit} is used to fit and determine the best results from penalized GEE method across tunning parameter grid.
#' 
#' @param y.vect a vector of dependent variable.
#' @param id.vect a vector of subjuect ID.
#' @param M a data frame or matrix of genomic dataset. Rows represent samples, columns represent variables.
#' @param COV a data frame or matrix of covariates dataset.
#' @param sis.obj a \code{sis} object. See \code{\link{sis}}.
#' @param lambda.n an integer specifying the number of tunning parameter lambda, the range of lambda is specifyied by \code{lambda.lim}. Default = 30.
#' @param lambda.lim a vector with two numbers specifying the limit of changing lambda for PGS to tune lambda. The lambda sequence is generated by \code{exp(-seq(lambda.lim[1], lambda.lim[2], length = lambda.n))}. Default = c(2,5).
#' @param pm.n an integer specifying the number of Pm levels, starting from 10 to \code{pm.n}. Default = 10. Pm is the number of top ranking variables from \code{\link{sis}}.    
#' @param pm.max an integer specifying the maximum Pm. Default = \code{NULL}. If \code{NULL}, \code{n/log10(n)} will be used (n is the number of total observations).
#' @param fold k-fold cross-validation in calculating grid error. Default = 10.
#' @param nonzero.eps non-zero beta threshold. During iteration, if beta estimation is shrinked down below this threshold, it will be forced to be zero. Default = \code{1e-5}.
#' @param eps convergence threshold. Iteration stops when the sum of beta estimation errors less than this threshold. Default = \code{1e-5}.
#' @param iter.n maximum iteration number. Iteration will stop anyway even if the \code{eps} is not met and throw a warning. Default = 50.
#' @param corstr a character string specifying the working correlation structure. The following are permitted: independence (\code{"indep"}), exchangeable (\code{"exch"}), autoregressive(1) (\code{"ar1"}), and unstructured (\code{"un"}). Default = \code{"ar1"}.
#' @param parallel logical. Enable parallel computing feature. Default = \code{TRUE}.
#' @param ncore number of cores to run parallel computation. Effective when \code{parallel} = \code{TRUE}. By default, max number of cores will be used.
#' @param seed an integer specifying seed for cross-validation. If not specified \code{pgsfit} will generate one.
#' 
#' @return variables selection and model fitting results in a \code{\link{pgsfit.obj}} object.
#' 
#' @seealso see \code{\link{sis}} to obtain proper ranked variables; see \code{\link{pgsfit.obj}} for class methods.
#' 
#' @examples
#' ### Dataset preview
#' BJdata()
#'
#' ### Convert binary variables into factor type
#' BJlung$gender = factor(BJlung$gender)
#' BJlung$heat = factor(BJlung$heat)
#' BJlung$cigwear = factor(BJlung$cigwear)
#' 
#' ### Merge miRNA and lung function dataset
#' BJdata <- merge(BJmirna, BJlung, by=c("SID","WD"))
#' 
#' ### Data must be sorted by study subject ID and multiple measurements indicator
#' BJdata <- BJdata[with(BJdata, order(SID, WD)), ]
#' 
#' ### Extract dependent variable (lung function)
#' y.vect<-BJdata$FEV1
#' 
#' ### Extract subjuect ID variable indicating repeated measures            
#' id.vect<-BJdata$SID        
#' 
#' ### Extract microRNA data matrix   
#' M<-BJdata[,3:168]   
#' 
#' ### Extract covariate data matrix       
#' COV<-BJdata[,170:179]
#'            
#' ### In the example we use linear mixed-effect model (default) for sure independent screening, ranked by p-values
#' sis_LMM_par = sis(y.vect, id.vect, M, COV)
#' 
#' ### If your computer have multiple cores, it is recommended to enable parallel option (default)
#' PGSfit = pgsfit(y.vect, id.vect, M, COV, sis_LMM_par, lambda.lim = c(3,5), pm.n = 12, pm.max = 120, seed = 1)
#' 
#' PGSfit        # print PGSfit summary
#' plot(PGSfit)  # plot cross-validation error grid
#' coef(PGSfit)  # return PGSfit coefficients
#' 
#' #For more information, please visit: https://github.com/YinanZheng/PGS/wiki/Example:-miRNA-expression-and-lung-function
#' 
#' @export
pgsfit<-function( y.vect,                
                  id.vect,               
                  M,                     
                  COV = NULL,            
                  sis.obj,
                  lambda.n = 30,
                  lambda.lim = c(2,5),
                  pm.n = 10, 
                  pm.max = NULL,
                  fold = 10,
                  nonzero.eps = 1e-5,
                  eps = 1e-5,           
                  iter.n = 50,
                  corstr = "ar1",        
                  parallel = TRUE,   
                  ncore = detectCores(), 
                  seed = NULL )
{
  cat("Missing value check...\n")
  component_list = c("Dependent variable", "Genomic data matrix", "Covaraites")
  missingCheck = c( length( (y.vect_missing = which(is.na(y.vect)))) , sum(!is.na((M_missing = which(is.na(M), arr.ind = T)[1]))) , (COV_missing = 0) )
  if(!is.null(COV)) missingCheck[3] = sum(!is.na((COV_missing = which(is.na(COV), arr.ind = T)[1])))
  if ( any( as.logical(missingCheck) ) )
  {
    missing_index = unique(na.omit(c(y.vect_missing, M_missing, COV_missing)))
    cat(paste0("Missing value(s) exist in ", paste(component_list[which(as.logical(missingCheck) == TRUE)], collapse = " & " ), ". ID = ", paste(id.vect[missing_index], collapse = ","), ". Any observation(s) with missing value(s) will be dropped.\n"))
    confirm = TRUE
    while(confirm)
    {
      cat("Do you want to continue? (y/n)\n")
      stdinLetter = readLines(n=1)
      if(stdinLetter == "n") stop("PGS stopped.") 
      else if(stdinLetter != "y") confirm = TRUE else {
        y.vect = y.vect[-missing_index]; M = M[-missing_index,]; COV = COV[-missing_index,]; id.vect = id.vect[-missing_index]
        cat("Missing observation(s) dropped.\n")
        confirm = FALSE
      }
    }
  } else { cat("No missing value found.\n") }
  
  id.vect = sort(as.numeric(as.factor(as.character(id.vect)))) # Convert any id format to number.
  indGen_res = indGen_cpp(id.vect)
  
  if (!is.null(COV)) {COV <- as.matrix((model.matrix(~.,as.data.frame(COV)))[,-1]); n.COV = ncol(COV); COV_scale = scaleto(COV)} else {n.COV = 0; COV_scale = scaleto(NULL)}
  if (is.null(seed)) seed = round(as.numeric(Sys.time()))
  
  cat(paste0("The complete working dataset contains ", indGen_res$n, " individuals and a total of ", indGen_res$obs_n, " observations. (seed = ", seed,")\n"))

  if(is.null(pm.max)) pm.max = indGen_res$obs_n/log10(indGen_res$obs_n)
  Pm.vect = (pm.vect <- ceiling(seq(10, pm.max, length = pm.n)))
  lam.vect = exp(-seq(lambda.lim[1], lambda.lim[2], length = lambda.n))
  n.marks = ncol(M)
  PGS.limit = min( (indGen_res$obs_n - n.COV - 1), n.marks) # The max number of p (exclude COV) that PGS can handles should not exceed the number of samples
  sis.vect = sis.obj$name[1:PGS.limit]
  M = M[,match(sis.vect, colnames(M))]
  y.vect_scale = scaleto(y.vect)
  M_scale = scaleto(M)
  
  if (pm.max > PGS.limit)
  {
    Pm.vect = c(Pm.vect[Pm.vect < PGS.limit], PGS.limit)
    cat(paste0("Besides the covariates, the maximum number of variables that PGS can afford is ", indGen_res$obs_n - n.COV - 1,". The max number of working variables = ", PGS.limit,". Your Pm.vect has been truncated to ", paste0(Pm.vect,collapse = " ")),'\n')
    pm.n = length(Pm.vect)
  }
    
  # Initialize rooms to store results
  grid.err = matrix(nrow = pm.n, ncol = lambda.n)   # Initiate error grid, column = lambda, row = Pm
  lam.sel.vect = rep(NA, pm.n)  # Initiate optimized lambda for each Pm
  beta.shrink.corr.list = vector("list", pm.n)   # Initiate penalized Beta (coefficient) estimates for each Pm
  var.sand.corr.list = vector("list", pm.n)  # Initiate Variance 
  flag.stop.corr.vect = rep(NA, pm.n) # Initiate stop flag
  iter.n.corr.vect = rep(NA, pm.n) # Initiate iteration time
  hat.R.list = vector("list", pm.n) # Initiate working correlation matrix

  checkParallel("PGS", parallel, ncore)
  
  cat(paste0("Tunning grid:"),'\n'); cat(paste0("   Pm = ", paste0(Pm.vect,collapse = " ")),'\n'); cat(paste0("   -ln(lambda) = ", lambda.lim[1], " to ", lambda.lim[2], " (", lambda.n, ")"),'\n')
  
  res_par <- foreach(M_chunk = iblkcol_cum(M_scale$d,Pm.vect), .packages = c("PGS","geepack") ) %dopar% {
   set.seed(seed)
   x.mat<-cbind(M_chunk, COV_scale$d)
   p = ncol(x.mat)
   dat.ini = data.frame(y.vect = y.vect_scale$d, x.mat)
   modelstatement = eval(parse(text=(paste0("y.vect ~ ", paste0(colnames(dat.ini)[-1],collapse = "+")))))  
   model <- try(geeglm(modelstatement, data = dat.ini, id = id.vect, corstr = corstr))
   beta.ini = as.numeric(summary(model)$coefficients[-1,1])
   one_run_grid_cpp(y.vect_scale$d, x.mat, id.vect, beta.ini, fold, p, lam.vect, nonzero.eps, eps, iter.n, corstr)
  }

  ## Summarize the results
  for(i in 1:pm.n)
  {
    temp.run = res_par[[i]]
    grid.err[i,] <- temp.run$lam.cv.cor
    lam.sel.vect[i] <- temp.run$lam.sel.cor
    beta.shrink.corr.list[[i]] <- temp.run$beta.shrink.cor
    var.sand.corr.list[[i]] <- temp.run$var.sand.cor
    flag.stop.corr.vect[i] <- temp.run$flag.stop.cor
    iter.n.corr.vect[i] <- temp.run$iter.n.cor
    hat.R.list[[i]] <- temp.run$hat.R
  }
  rownames(grid.err) <- Pm.vect; colnames(grid.err) <- lam.vect
  bestall.ind <- which(grid.err==min(grid.err), arr.ind=T)
  grid.err.converge <- grid.err[iter.n.corr.vect < iter.n, ]
  if(is.na(grid.err.converge[1])) {best.ind <- bestall.ind; ConvergeMessage = "Not converged!"} else {
    best.ind <- which(grid.err == min(grid.err.converge), arr.ind=T)
    if(identical(best.ind, bestall.ind)) ConvergeMessage = "Converged!" else ConvergeMessage = "Conditionally converged!"
  }
 
  scale.info = list(y.vect_scale.ds = y.vect_scale$ds, M_scale.dn = M_scale$dn, M_scale.ds = M_scale$ds, COV_scale.dn = COV_scale$dn, COV_scale.ds = COV_scale$ds)
  res <- pgsfit.obj(grid.err, lam.sel.vect, beta.shrink.corr.list, var.sand.corr.list, hat.R.list, flag.stop.corr.vect, iter.n.corr.vect, best.ind, bestall.ind, eps, iter.n, Pm.vect, lam.vect, scale.info, sis.obj$name[1:Pm.vect[pm.n]], ConvergeMessage)
  print(res)
  cat(paste0("Done!    (",Sys.time(),")\n"))
  return(res)
}