R/StepOne.R
In SMAHP: Survival Mediation Analysis of High-Dimensional Proteogenomic Data

Documented in StepOne

#' Penalized mediation and outcome models
#'
#' An auxiliary function which conducts variable selection of X for Y using penAFT, X for M using the selected penalized model and M for Y using penAFT.
#' @param X An n by p matrix of exposures.
#' @param M An n by p matrix of mediators.
#' @param time A vector of survival time of samples.
#' @param status A vector of status indicator: 0=alive, 1=dead.
#' @param model_option The penalization method used when selecting X for M. The options include MCP, elastic net and lasso. Default is MCP.
#' @return A list with the following components:
#'   \item{X_sel_Y_s1}{X selected for Y using penAFT}
#'   \item{M_X_s1}{X selected for M using the selected penalized model}
#'   \item{M_sel_Y_s1}{M selected for Y using penAFT}
#' @import penAFT
#' @import ncvreg
#' @import glmnet
#' @export
#' @examples
#' \donttest{
#' data(example_dat)
#' surv_dat <- example_dat$surv_dat
#' res_step1 <- StepOne(X = example_dat$X, M = example_dat$M, time = surv_dat$time,
#' status = surv_dat$status, model_option = "MCP")
#' }
#'

StepOne <- function(X, M, time, status, model_option = "MCP") {
  d_x <- ncol(X)
  d_m <- ncol(M)

  ## step 1
  ### select X for survival outcome using penAFT
  fit <- penAFT.cv(X = X, logY = log(time), delta = status,
                   penalty = "EN", alpha = 1)
  coef_sel <- penAFT.coef(fit)$beta
  coef_pos <- which(coef_sel != 0)
  X_sel_Y_s1 <- colnames(X)[coef_pos]

  ### select X for M
  M_X_s1 <- list()
  for(m in colnames(M)){
    if(model_option == "MCP"){
      pen_model <- cv.ncvreg(X = X, y = M[ ,m], family = "gaussian", penalty = "MCP")
      pred_sel <- rownames(summary(pen_model)$fit_summary$table)
    } else{
      if(model_option == "elastic net"){
        alpha_glmnet <- 0.5
      }
      if(model_option == "lasso"){
        alpha_glmnet <- 1
      }
      cv_fit <- cv.glmnet(x = X, y = M[ ,m], family = "gaussian", alpha = alpha_glmnet)
      best_lambda <- cv_fit$lambda.min
      beta_coefficients <- coef(cv_fit, s = best_lambda)
      beta_df <- as.data.frame(as.matrix(beta_coefficients))
      beta_df$variable <- rownames(beta_df)
      non_zero_coefficients <- beta_df[beta_df$s1 != 0, ]
      pred_sel <- non_zero_coefficients$variable
      pred_sel <- setdiff(pred_sel, "(Intercept)")
    }
    M_X_s1[[m]] <- pred_sel
  }

  ### select M for Y
  fit <- penAFT.cv(X = M, logY = log(time), delta = status,
                   penalty = "EN", alpha = 1)
  coef_sel <- penAFT.coef(fit)$beta
  coef_pos <- which(coef_sel != 0)
  M_sel_Y_s1 <- colnames(M)[coef_pos]

  return(list(X_sel_Y_s1 = X_sel_Y_s1, M_X_s1 = M_X_s1, M_sel_Y_s1 = M_sel_Y_s1))

}