R/zzz.R
In hdcuremodels: Penalized Mixture Cure Models for High-Dimensional Data

c_stat_b <-
  function(b_u_hat=NULL, itct_hat, b_p_hat, testing_delta, testing_time, X_u=NULL, X_p){
    C_csw_num = 0
    C_csw_denom = 0
    testing_n = length(testing_time)
    if(all(b_p_hat==0)) {
      if(is.null(X_u)) xb = rep(itct_hat, testing_n) else xb = itct_hat+X_u %*% b_u_hat
    } else{
      if(is.null(X_u)) xb = itct_hat + X_p[,b_p_hat!=0,drop=FALSE] %*% b_p_hat[b_p_hat!=0]
      else xb = itct_hat+X_u %*% b_u_hat + X_p[,b_p_hat!=0,drop=FALSE] %*% b_p_hat[b_p_hat!=0]
    }
    for(i in 1:testing_n)
      for(j in 1:testing_n){
        if (j==i | !testing_delta[i] | testing_time[i]>testing_time[j]) next
        I_ij = testing_time[i]<testing_time[j] | (testing_time[i]==testing_time[j] & !testing_delta[j])
        if (!I_ij) next
        if (xb[i]>xb[j]) C_csw_num = C_csw_num + 1
        C_csw_denom = C_csw_denom + 1
      }
    return(C_csw_num / C_csw_denom)
  }

c_stat_beta <-
  function(beta_u_hat=NULL,  beta_p_hat, testing_delta, testing_time, W_u=NULL, W_p){
    C_csw_num = 0
    C_csw_denom = 0
    testing_n = length(testing_time)
    if(all(beta_p_hat==0)) {
      if(is.null(W_u)) W_beta = rep(0, testing_n) else W_beta = W_u %*% beta_u_hat
    } else{
      if(is.null(W_u)) W_beta = W_p[,beta_p_hat!=0,drop=FALSE] %*% beta_p_hat[beta_p_hat!=0]
      else W_beta = W_u %*% beta_u_hat + W_p[,beta_p_hat!=0,drop=FALSE] %*% beta_p_hat[beta_p_hat!=0]
    }
    for(i in 1:testing_n)
      for(j in 1:testing_n){
        if (j==i | !testing_delta[i] | testing_time[i]>testing_time[j]) next
        I_ij = testing_time[i]<testing_time[j] | (testing_time[i]==testing_time[j] & !testing_delta[j])
        if (!I_ij) next
        if (W_beta[i]>W_beta[j]) C_csw_num = C_csw_num + 1
        C_csw_denom = C_csw_denom + 1
      }
    return(C_csw_num / C_csw_denom)
  }



cox_l1 <-
  function(X_u=NULL, X_p=NULL, W_u=NULL, W_p=NULL, time, delta, mu_inc, mu_lat, inits=NULL,
           nIter=100, tol = 1e-4)
  {
    # mu: penalty parameter
    # tol: difference between log-likelihood
    N = length(time)
    J = ncol(X_p) # number of penalized incidence covariates
    M = ncol(W_p) # number of penalized latency covariates

    CAP = 10

    event_time = time[delta==1]
    uniq_event_time = unique(event_time)
    n0 = length(uniq_event_time)
    I0 = matrix(time, ncol = 1) %*% matrix(1, 1, n0) >=
      matrix(1, N, 1) %*% matrix(uniq_event_time, nrow = 1)    # N*n0
    d_j = as.numeric(table(event_time)[rank(unique(event_time))])   #  number of events at time T_j
    T_n0 = max(event_time)   # last event
    tail_ind = which(time>=T_n0 & delta==0)

    ######## initialization ########
    step = 1
    if (!is.null(X_p)) b_p <- rep(0,J) # KJA added to allow no penalized incidence variables
    if (!is.null(W_p)) beta_p <- rep(0,M) # KJA added added to allow no penalized latency variables
    if(is.null(inits)) inits = initialization_cox(X_u, W_u, time, delta)
    itct = inits$itct; b_u = inits$b_u; beta_u = inits$beta_u; survprob = inits$survprob
    if(is.null(X_u)) b_x = rep(itct,N) else b_x = itct+X_u %*% b_u
    if(is.null(X_u)) { # KJA changed to allow no penalized incidence variables
      pen_fac_inc = rep(1, ncol(X_p))
    } else if (is.null(X_p)) {
      pen_fac_inc = rep(0, ncol(X_u))
    } else {
      pen_fac_inc = c(rep(0, ncol(X_u)), rep(1, ncol(X_p)))
    }
    if(is.null(W_u)) {
      pen_fac_lat = rep(1, ncol(W_p))
    } else if (is.null(W_p)) {
      pen_fac_lat = rep(0, ncol(W_u))
    } else {
      pen_fac_lat = c(rep(0, ncol(W_u)), rep(1, ncol(W_p)))
    }
    pir = rep(1, N)
    llp1_0 <- llp2_0 <- 0
    lik_inc <- lik_lat <- c()
    b_p_path <- beta_p_path <- b_u_path <- beta_u_path <- itct_path <- NULL
    conv1 <- conv2 <- FALSE

    ######## loop ########

    repeat{

      #### E-step
      pi_x = 1/(1+exp(-b_x))
      numerator = pi_x[delta==0] * survprob[delta==0]
      pir[delta==0] = numerator/(1-pi_x[delta==0] + numerator)

      #### M-step
      ### incidence
      if (!conv1){
        fit_inc = glmnet::glmnet(x = cbind(X_u, X_p), y = cbind(1-pir, pir), family = "binomial",
                                 penalty.factor = pen_fac_inc, lambda=mu_inc, standardize = FALSE)
        coef_inc = coef(fit_inc)
        itct = max(-CAP, min(CAP, coef_inc[1]))
        if(is.null(X_u)) {
          b_p = pmax(-CAP, pmin(CAP, coef_inc[-1]))
          b_x = itct + X_p[,b_p!=0,drop=FALSE] %*% b_p[b_p!=0]
        } else if (is.null(X_p)) {
          b_u = pmax(-CAP, pmin(CAP, coef_inc[2:(ncol(X_u)+1)]))
          b_x = itct + X_u %*% b_u
        } else {
          b_u = pmax(-CAP, pmin(CAP, coef_inc[2:(ncol(X_u)+1)]))
          b_p = pmax(-CAP, pmin(CAP, coef_inc[-(1:(ncol(X_u)+1))]))
          b_x = itct + X_u %*% b_u + X_p[,b_p!=0,drop=FALSE] %*% b_p[b_p!=0]
        }
        llp1_1 = sum(pir*b_x - log(1+exp(b_x))) - N * mu_inc *sum(abs(b_p))
      }
      lik_inc = c(lik_inc, llp1_1)

      ### latency
      if (!conv2){
        nz_pir = which(pir>0)
        fit_lat = glmnet::glmnet(x = cbind(W_u, W_p)[nz_pir,],
                                 y = Surv(time = time[nz_pir], event = delta[nz_pir]),
                                 family = "cox",
                                 offset = log(pir[nz_pir]),
                                 penalty.factor = pen_fac_lat,
                                 lambda = mu_lat, standardize = FALSE)
        coef_lat = coef(fit_lat)
        if (is.null(W_u)) {
          beta_p = pmax(-CAP, pmin(CAP, as.numeric(coef_lat)))
          beta_w = W_p[,beta_p!=0,drop=FALSE] %*% beta_p[beta_p!=0]
        } else if (is.null(W_p)) {
          beta_u = pmax(-CAP, pmin(CAP, coef_lat[1:ncol(W_u)]))
          beta_w = W_u %*% beta_u
        } else {
          beta_u = pmax(-CAP, pmin(CAP, coef_lat[1:ncol(W_u)]))
          beta_p = pmax(-CAP, pmin(CAP, coef_lat[-(1:ncol(W_u))]))
          beta_w = W_u %*% beta_u + W_p[,beta_p!=0,drop=FALSE] %*% beta_p[beta_p!=0]
        }

        ### survival
        exp_beta_w_p = exp(beta_w) * pir
        denom = matrix(exp_beta_w_p,1) %*% I0   # 1*n0
        hazard = rep(0, N)
        hazard[delta==1] = sapply(event_time, function(x) (d_j/denom)[uniq_event_time==x])
        accum_hazard = sapply(time, function(x) sum((d_j/denom)[uniq_event_time<=x]))
        surv_baseline = exp(-accum_hazard)
        if (length(tail_ind)>0){
          wtail_alpha = uniroot(function(a)
            sum(-exp_beta_w_p*max(accum_hazard)*log(time/T_n0)*(time/T_n0)^a + delta/a + delta*log(time/T_n0)),
            c(0.01,10), extendInt = "yes")$root
          wtail_lambda = (max(accum_hazard))^(1/wtail_alpha)/T_n0
          surv_baseline[tail_ind] = exp(-(wtail_lambda*time[tail_ind])^wtail_alpha)
        }
        survprob = surv_baseline^exp(beta_w)
        llp2_1 = sum(log(hazard[delta==1]) + beta_w[delta==1]) - sum(exp_beta_w_p*accum_hazard) -
          N * mu_lat *sum(abs(beta_p))
      }
      lik_lat = c(lik_lat, llp2_1)

      ## record updated parameters
      itct_path = c(itct_path, itct)
      if(!is.null(X_u)) b_u_path = rbind(b_u_path, b_u)
      if(!is.null(X_p)) b_p_path = rbind(b_p_path, b_p)
      if(!is.null(W_u)) beta_u_path = rbind(beta_u_path, beta_u)
      if(!is.null(W_p)) beta_p_path = rbind(beta_p_path, beta_p)
      if (!conv1 & abs(llp1_1- llp1_0)< tol) conv1 <- TRUE
      if (!conv2 & abs(llp2_1- llp2_0)< tol) conv2 <- TRUE
      if (step > 1 & (conv1 & conv2) | step >= nIter) { #
        break
      }
      llp1_0 <- llp1_1
      llp2_0 <- llp2_1
      step <- 1 + step
    }

    ######## output ########
    output <- list(b_p_path = b_p_path, beta_p_path = beta_p_path, b_u_path = b_u_path, itct_path = itct_path,
                   beta_u_path = beta_u_path,
                   lik_inc = lik_inc, lik_lat = lik_lat)
    output
  }

cox_mcp_scad <-
  function(X_u=NULL, X_p=NULL, W_u=NULL, W_p=NULL, time, delta, penalty = "MCP",
           mu_inc, mu_lat, gamma_inc = 3, gamma_lat = 3, inits = NULL,
           nIter=1000, tol = 1e-4)
  {
    # lambda: penalty parameter
    # tol: difference between log-likelihood
    N = length(time)
    J = ncol(X_p) # number of penalized incidence covariates
    M = ncol(W_p) # number of penalized latency covariates

    CAP = 20

    event_time = time[delta==1]
    uniq_event_time = unique(event_time)
    n0 = length(uniq_event_time)
    I0 = matrix(time, ncol = 1) %*% matrix(1, 1, n0) >=
      matrix(1, N, 1) %*% matrix(uniq_event_time, nrow = 1)    # N*n0
    d_j = as.numeric(table(event_time)[rank(uniq_event_time)])   #  number of events at time T_j
    Z = sapply(uniq_event_time, function(t) colSums(W_p[time==t & delta==1,,drop=FALSE])) # M*n0
    T_n0 = max(event_time)   # last event
    tail_ind = which(time>=T_n0 & delta==0)

    ######## initialization ########
    step = 1
    if (!is.null(X_p)) b_p <- rep(0,J) # KJA added to allow no penalized incidence variables
    if (!is.null(W_p)) beta_p <- rep(0,M) # KJA added added to allow no penalized latency variables
    if(is.null(inits)) inits = initialization_cox(X_u, W_u, time, delta)
    itct = inits$itct; b_u = inits$b_u; beta_u = inits$beta_u; survprob = inits$survprob
    if(is.null(X_u)) b_x = rep(itct,N) else b_x = itct+X_u %*% b_u
    if(is.null(W_u)) beta_w = rep(0,N) else beta_w = W_u %*% beta_u
    pir = rep(1, N)
    exp_beta_w_p = exp(beta_w) * pir
    lik_inc <- lik_lat <- c()
    b_p_path <- beta_p_path <- b_u_path <- beta_u_path <- itct_path <- NULL
    llp1_0 <- llp2_0 <- 0
    conv1 <- conv2 <- FALSE
    ######## loop ########

    repeat{

      #### E-step
      pi_x = 1/(1+exp( pmin(CAP, -b_x) ))
      numerator = pi_x[delta==0] * survprob[delta==0]
      pir[delta==0] = numerator/pmax(1e-50,1-pi_x[delta==0] + numerator)

      #### M-step
      ### incidence
      if (!conv1){
        for (l in 1:J){
          bl = b_p[l]
          b_p[l] = max(-CAP, min(CAP, mcp_scad_cd_upd_inc(penalty=penalty, pir, pi_x, X_p[,l], bl,
                                                          gamma_inc, mu_inc)))
          b_x = b_x + X_p[,l,drop=FALSE] %*% (b_p[l] - bl)
          pi_x = 1/(1+exp( pmin(CAP, -b_x) ))
        }
        out_inc = optim(par = c(itct,b_u), fn = mcp_scad_negloglik_inc, gr = mcp_scad_gradient_inc,
                        b_p = b_p, X_u =X_u, X_p = X_p, pir = pir, CAP=CAP, method="BFGS") # incidence
        itct = max(-CAP, min(CAP, out_inc$par[1]))
        if(!is.null(X_u)) b_u = pmax(-CAP, pmin(CAP, out_inc$par[2:(ncol(X_u)+1)]))
        pen = ifelse(penalty=="MCP", mcp_penalty(b_p, gamma_inc, mu_inc),
                     scad_penalty(b_p, gamma_inc, mu_inc))
        llp1_1 = -out_inc$value - N*pen
        if(is.null(X_u)) b_x = itct + X_p[,b_p!=0,drop=FALSE] %*% b_p[b_p!=0]
        else b_x = itct + X_u %*% b_u + X_p[,b_p!=0,drop=FALSE] %*% b_p[b_p!=0]
      }
      lik_inc = c(lik_inc, llp1_1)

      ### latency
      if (!conv2){
        for (l in 1:M){
          betal = beta_p[l]
          beta_p[l] = max(-CAP, min(CAP, mcp_scad_cd_upd_lat(penalty, exp_beta_w_p, W_p[,l], betal,
                                                             Z[l,], I0, d_j, gamma_lat, mu_lat)))
          change = W_p[,l,drop=FALSE] %*% (beta_p[l] - betal)  # N*1
          beta_w = beta_w + change  # N*1
          exp_beta_w_p = exp( pmin(CAP, beta_w) ) * pir  # N*1
          l = l + 1
        }
        if(!is.null(W_u)){
          out_lat = optim(par = beta_u, fn = mcp_scad_negloglik_lat, gr = mcp_scad_gradient_lat,
                          beta_p=beta_p, W_u=W_u, W_p=W_p, delta=delta, pir=pir, I0=I0, d_j=d_j,
                          CAP=CAP, method="BFGS") # latency
          beta_u = pmax(-CAP, pmin(CAP, out_lat$par)) # unpenalized incidence
          beta_w = W_u %*% beta_u + W_p[,beta_p!=0,drop=FALSE] %*% beta_p[beta_p!=0]
          exp_beta_w_p = exp( pmin(CAP, beta_w) ) * pir
        }

        ### survival
        denom = matrix(exp_beta_w_p,1) %*% I0   # 1*n0
        hazard = rep(0, N)
        hazard[delta==1] = sapply(event_time, function(x) (d_j/denom)[uniq_event_time==x])
        accum_hazard = sapply(time, function(x) sum((d_j/denom)[uniq_event_time<=x]))
        surv_baseline = exp(-accum_hazard)
        if (length(tail_ind)>0){
          tmp <- try(wtail_alpha <- uniroot(function(a)
            sum(-exp_beta_w_p*max(accum_hazard)*log(time/T_n0)*(time/T_n0)^a + delta/a + delta*log(time/T_n0)),
            c(0.01,10), extendInt = "yes")$root, silent = TRUE)
          if (!inherits(tmp, "try-error")){
            wtail_lambda = (max(accum_hazard))^(1/wtail_alpha)/T_n0
            surv_baseline[tail_ind] = exp(-(wtail_lambda*time[tail_ind])^wtail_alpha)
          }
        }
        survprob = surv_baseline^exp(pmin(CAP, beta_w))
        pen = ifelse(penalty=="MCP", mcp_penalty(beta_p, gamma_lat, mu_lat),
                     scad_penalty(beta_p, gamma_lat, mu_lat))
        llp2_1 = sum(log(hazard[delta==1]) + beta_w[delta==1])
        - sum(exp_beta_w_p*accum_hazard) - N*pen
      }
      lik_lat = c(lik_lat, llp2_1)

      ## record updated parameters
      itct_path = c(itct_path, itct)
      if(!is.null(X_u)) b_u_path = rbind(b_u_path, b_u)
      b_p_path = rbind(b_p_path, b_p)
      if(!is.null(W_u)) beta_u_path = rbind(beta_u_path, beta_u)
      beta_p_path = rbind(beta_p_path, beta_p)
      if (!conv1 & abs(llp1_1- llp1_0)< tol) conv1 <- TRUE
      if (!conv2 & abs(llp2_1- llp2_0)< tol) conv2 <- TRUE
      if (step > 1 & (conv1 & conv2) | step >= nIter) { #
        break
      }
      llp1_0 <- llp1_1
      llp2_0 <- llp2_1
      step <- 1 + step
    }

    ######## output ########
    output <- list(b_p_path = b_p_path, b_u_path = b_u_path, itct_path = itct_path,
                   beta_p_path = beta_p_path, beta_u_path = beta_u_path,
                   lik_inc = lik_inc, lik_lat = lik_lat)
    output
  }

cure.em <-
  function(X_u=NULL, X_p, W_u=NULL, W_p, time, event, model="cox", penalty="lasso",
           penalty.factor.inc=rep(1,ncol(x.inc)), penalty.factor.lat=rep(1,ncol(x.lat)),
           thresh=1e-03, maxit=ifelse(penalty=="lasso", 100, 1000),
           lambda.inc=0.1, lambda.lat=0.1, gamma.inc=3, gamma.lat=3, inits=NULL){
    x.inc <- cbind(X_u, X_p)
    x.lat <- cbind(W_u, W_p)
    if (is.null(dim(x.inc)) | is.null(dim(x.lat)))
      stop("x.inc and x.lat should be matrices with 2 or more columns")
    if((ncol(x.inc) <= 1) | (ncol(x.lat) <= 1))
      stop("x.inc and x.lat should be matrices with 2 or more columns")
    if (nrow(x.inc) != nrow(x.lat) | nrow(x.lat) != length(time) | length(time)!= length(event))
      stop("Input dimension mismatch")
    if(class(x.inc)[1] == "data.frame" | class(x.lat)[1] == "data.frame"){
      x.inc = as.matrix(x.inc); x.lat = as.matrix(x.lat)
    }
    if(!model%in%c("cox","weibull","exponential"))
      stop("Only 'cox', 'weibull', 'exponential' available for 'model' parameter")
    if(!penalty%in%c("lasso","MCP","SCAD"))
      stop("Only 'lasso', 'MCP', 'SCAD' available for 'penalty' parameter")
    if(any(!c(penalty.factor.inc, penalty.factor.lat)%in%c(0,1)))
      stop("Penalty factors can only contain 0 or 1")
    if(any(c(lambda.inc, lambda.lat, gamma.inc, gamma.lat)<=0))
      stop("Penalty pamameters lambda and gamma should be positive")
    if(!is.null(inits))
      inits = inits_check(model, N=length(time), penalty.factor.inc, penalty.factor.lat, inits)
    if(model != "cox" & penalty != "lasso")    penalty = "lasso"
    if(model=="cox" & penalty=="lasso")
      fit = cox_l1(X_u, X_p, W_u, W_p, time, event, lambda.inc, lambda.lat, inits, maxit, thresh)
    else if(model=="cox" & penalty %in% c("MCP","SCAD"))
      fit = cox_mcp_scad(X_u, X_p, W_u, W_p, time, event, penalty, lambda.inc, lambda.lat,
                         gamma.inc, gamma.lat, inits, maxit, thresh)
    else if(model=="weibull")
      fit = weib_EM(X_u, X_p, W_u, W_p, time, event, lambda.inc, lambda.lat, inits, maxit, thresh)
    else if(model=="exponential")
      fit = exp_EM(X_u, X_p, W_u, W_p, time, event, lambda.inc, lambda.lat, inits, maxit, thresh)
    model_select = which.max(fit$lik_inc+fit$lik_lat)
    b = rep(NA, ncol(x.inc))
    beta = rep(NA, ncol(x.lat))
    b[penalty.factor.inc==0] = fit$b_u_path[model_select,]
    b[penalty.factor.inc==1] = fit$b_p_path[model_select,]
    beta[penalty.factor.lat==0] = fit$beta_u_path[model_select,]
    beta[penalty.factor.lat==1] = fit$beta_p_path[model_select,]
    output = list(b0=fit$itct_path[model_select], b=b, beta=beta)
    if(model %in% c("exponential","weibull"))
      output$rate = fit$lambda_path[model_select]
    if(model == "weibull")
      output$alpha = fit$alpha_path[model_select]
    output$logLik.inc = fit$lik_inc[model_select]
    output$logLik.lat = fit$lik_lat[model_select]
    return(output)
  }

cv.em.fdr <-
  function(X_u, X_p, W_u, W_p, time, delta, model=c("weibull","exponential"),
           penalty=c("lasso","MCP","SCAD"), fdr=0.2, thresh=1e-3,
           nIter=ifelse(penalty=="lasso", 100, 1000), penalty.factor.inc, penalty.factor.lat,
           grid.tuning = FALSE, lambda.inc.list=NULL, lambda.lat.list=NULL,
           nlambda.inc = ifelse(grid.tuning, 10, 50),
           nlambda.lat = ifelse(grid.tuning, 10, 50),
           lambda.min.ratio.inc = 0.1, lambda.min.ratio.lat = 0.1,
           gamma.inc=3, gamma.lat=3,
           inits=NULL,
           n_folds=5, measure.inc=c("c","auc"), one.se=FALSE, cure_cutoff=5,
           parallel=FALSE, seed=NULL, verbose=TRUE){
    if(!is.null(seed)) set.seed(seed)
    X_k = knockoff::create.second_order(X_p, method = "asdp", shrink = T)
    W_k = knockoff::create.second_order(W_p, method = "asdp", shrink = T)
    Xaug = cbind(X_p, X_k)
    Waug = cbind(W_p, W_k)
    fit = cv.em.nofdr(X_u, Xaug, W_u, Waug, time, delta, model, penalty, thresh, nIter,
                      grid.tuning, lambda.inc.list, lambda.lat.list, nlambda.inc, nlambda.lat,
                      lambda.min.ratio.inc, lambda.min.ratio.lat, gamma.inc, gamma.lat, inits,
                      n_folds, measure.inc, one.se, cure_cutoff, parallel, seed, verbose)
    if(is.null(X_u)) b_p = fit$b else {
      b_u = fit$b[1:ncol(X_u)]
      b_p = fit$b[(ncol(X_u)+1):(length(fit$b))]
    }
    if(is.null(W_u)) beta_p = fit$beta else {
      beta_u = fit$beta[1:ncol(W_u)]
      beta_p = fit$beta[(ncol(W_u)+1):(length(fit$beta))]
    }
    Z_b = abs(b_p)
    Z_beta = abs(beta_p)
    J = ncol(X_p)
    M = ncol(W_p)
    orig_b = 1:J
    orig_beta = 1:M
    W_b = Z_b[orig_b] - Z_b[orig_b+J]
    W_beta = Z_beta[orig_beta] - Z_beta[orig_beta+M]
    T_b = knockoff::knockoff.threshold(W_b, fdr = fdr, offset = 1)
    T_beta = knockoff::knockoff.threshold(W_beta, fdr = fdr, offset = 1)
    selected_b = (1:J)[W_b > T_b]
    selected_beta = (1:M)[W_beta > T_beta]
    # KJA 05-09 added b_u code above and this code to output b0, b, beta, rate, and alpha
    if (identical(unname(selected_b), integer(0))) {
      b_p[1:J] <- 0
    } else {
      b_p[-selected_b] <- 0
    }
    if (identical(unname(selected_beta), integer(0))) {
      beta_p[1:M] <- 0
    } else {
      beta_p[-selected_beta] <- 0
    }
    if (is.null(X_u)) {
      b = b_p[1:J]
    } else {
       b = c(b_u, b_p[1:J])
    }
    if (is.null(W_u)) {
      beta = beta_p[1:M]
    } else {
       beta = c(beta_u, beta_p[1:M])
    }
    return(list(b0 = fit$b0, b = b, beta = beta, rate = fit$rate, alpha = fit$alpha, selected_b=selected_b, selected_beta=selected_beta))
  }

cv.em.inner <-
  function(X_u, X_p, W_u, W_p, time, delta, folds_i, k,
           model=c("cox","weibull","exponential"), penalty=c("lasso","MCP","SCAD"),
           thresh=1e-5, nIter=1e4,
           grid.tuning = FALSE, tuning_sequence = NULL,
           gamma.inc=3, gamma.lat=3, inits=NULL,
           measure.inc=c("c","auc"), cure_cutoff=5){
    test_i = which(folds_i == k)
    test_delta = delta[test_i]
    test_time = time[test_i]
    test_X_p = X_p[test_i,,drop=FALSE]
    test_W_p = W_p[test_i,,drop=FALSE]
    if(is.null(X_u)) test_X_u = NULL else test_X_u = X_u[test_i,,drop=FALSE]
    if(is.null(W_u)) test_W_u = NULL else test_W_u = W_u[test_i,,drop=FALSE]
    if(is.null(X_u)) X_u_train = NULL else X_u_train = X_u[-test_i,,drop=FALSE]
    if(is.null(W_u)) W_u_train = NULL else W_u_train = W_u[-test_i,,drop=FALSE]
    if(is.null(X_u)) penalty.factor.inc = rep(1,ncol(X_p))
    else penalty.factor.inc = rep(0:1,c(ncol(X_u), ncol(X_p)))
    if(is.null(W_u)) penalty.factor.lat = rep(1,ncol(W_p))
    else penalty.factor.lat = rep(0:1,c(ncol(W_u), ncol(W_p)))
    initial_val = inits
    if(model=="cox") initial_val$survprob = initial_val$survprob[-test_i]
    cst = rep(NA, nrow(tuning_sequence))
    if(measure.inc=="auc") auc = rep(NA, nrow(tuning_sequence))
    for (i in 1:nrow(tuning_sequence)){
      lambda.inc = tuning_sequence[i,1]
      if(grid.tuning) lambda.lat = tuning_sequence[i,2] else lambda.lat = lambda.inc
      train_out = cure.em(X_u_train, X_p[-test_i,], W_u_train, W_p[-test_i,],
                          time[-test_i], delta[-test_i], model, penalty,
                          penalty.factor.inc, penalty.factor.lat, thresh, nIter,
                          lambda.inc, lambda.lat, gamma.inc, gamma.lat, initial_val)

      #train_out = cure.em(cbind(X_u_train, X_p[-test_i,]), cbind(W_u_train, W_p[-test_i,]),
      #                    time[-test_i], delta[-test_i], model, penalty,
      #                    penalty.factor.inc, penalty.factor.lat, thresh, nIter,
      #                    lambda.inc, lambda.lat, gamma.inc, gamma.lat, initial_val)
      if(is.null(X_u)) {b_u = NULL; b_p = train_out$b}
      else {b_u = train_out$b[1:ncol(X_u)]; b_p = train_out$b[(ncol(X_u)+1):(length(train_out$b))]}
      if(is.null(W_u)) {beta_u = NULL; beta_p = train_out$beta}
      else {beta_u = train_out$beta[1:ncol(W_u)]; beta_p = train_out$beta[(ncol(W_u)+1):(length(train_out$beta))]}
      cst[i] = C.stat(cure_cutoff, b_u, train_out$b0, b_p, beta_u, beta_p,
                      test_delta, test_time, test_X_u, test_X_p, test_W_u, test_W_p)
      if(measure.inc=="auc") auc[i] = AUC_msi(cure_cutoff = cure_cutoff, b_u, train_out$b0, b_p,
                                              test_delta, test_time, test_X_u, test_X_p)
    }
    if(measure.inc=="auc"){
      res = rbind(auc, cst)
      rownames(res) = c("AUC","Cstat")
    }
    else res = cst
    return(res)
  }

cv.em.nofdr <-
  function(X_u, X_p, W_u, W_p, time, delta, model=c("cox","weibull","exponential"),
           penalty=c("lasso","MCP","SCAD"),
           thresh=1e-03, nIter=ifelse(penalty=="lasso", 100, 1000),
           grid.tuning = FALSE, lambda.inc.list=NULL, lambda.lat.list=NULL,
           nlambda.inc = ifelse(grid.tuning, 10, 50),
           nlambda.lat = ifelse(grid.tuning, 10, 50),
           lambda.min.ratio.inc = 0.1, lambda.min.ratio.lat = 0.1,
           gamma.inc=3, gamma.lat=3,
           inits=NULL,
           n_folds=5, measure.inc=c("c","auc"), one.se=FALSE, cure_cutoff=5,
           parallel=FALSE, seed=NULL, verbose){
    if(!is.null(seed)) set.seed(seed)
    if(!grid.tuning){
      if((!is.null(lambda.inc.list) & !is.null(lambda.lat.list) &
          !identical(lambda.inc.list, lambda.lat.list)) | (nlambda.inc!=nlambda.lat) |
         (lambda.min.ratio.inc!=lambda.min.ratio.lat))
        warning("Grid tuning is off. Same lambda sequence for incidence and latency was used.")
      if(length(lambda.inc.list)>length(lambda.lat.list)) lambda.lat.list=lambda.inc.list
      else lambda.inc.list=lambda.lat.list
      nlambda.inc <- nlambda.lat <- max(nlambda.inc, nlambda.lat)
      lambda.min.ratio.inc <- lambda.min.ratio.lat <- min(lambda.min.ratio.inc, lambda.min.ratio.lat)
    }

    # decide lambda grid or sequence for tuning
    N = length(time)
    if(is.null(inits)){
      if(model=="cox") inits = initialization_cox(X_u, W_u, time, delta)
      else inits = initialization_parametric(X_u, W_u, time, delta, model)
    }
    if(grid.tuning){
      if(is.null(lambda.inc.list) | is.null(lambda.lat.list)){
        pir = rep(1, N)
        if(is.null(X_u)) pi_x = rep(mean(delta), N)
        else pi_x = 1/(1+exp(-inits$itct-X_u %*% inits$b_u))
        if(is.null(W_u)) beta_w = rep(0,N) else beta_w = W_u %*% inits$beta_u
        if(model=="cox") survprob = inits$survprob
        else if(model=="weibull")
          survprob = exp(-(inits$lambda * time)^(inits$alpha) * exp(beta_w))
        else if(model=="exponential") survprob = exp(-inits$lambda * time * exp(beta_w))
        numerator = pi_x[delta==0] * survprob[delta==0]
        pir[delta==0] = numerator/(1-pi_x[delta==0] + numerator)
        if(is.null(lambda.inc.list)){
          lambda.max.inc = max(abs(matrix(pir-0.5,1,N)%*%X_p))/N
          lambda.min.inc = lambda.max.inc * lambda.min.ratio.inc
          k1 = (0:(nlambda.inc-1)) / nlambda.inc
          lambda.inc.list = lambda.max.inc * lambda.min.ratio.inc^k1
        }
        if(is.null(lambda.lat.list)){
          nz_pir = which(pir>0)
          lambda.max.lat = get_cox_lambda_max(x = cbind(W_u, W_p)[nz_pir,],
                                              time = time[nz_pir], event = delta[nz_pir],
                                              offset = log(pir[nz_pir]))
          lambda.min.lat = lambda.max.lat * lambda.min.ratio.lat
          k2 = (0:(nlambda.lat-1)) / nlambda.lat
          lambda.lat.list = lambda.max.lat * lambda.min.ratio.lat^k2
        }
      }
      lambda.grid = expand.grid(lambda.inc.list, lambda.lat.list)
    }else{ # use same sequence to tune lambda.inc and lambda.lat
      if(is.null(lambda.inc.list)){
        lambda_max = max(colSums(abs(X_p)), colSums(abs(W_p)))/(2*N)
        lambda_min = lambda_max * lambda.min.ratio.inc
        k1 = (0:(nlambda.inc-1)) / nlambda.inc
        lambda.list = lambda_max * lambda.min.ratio.inc^k1
      }else lambda.list = lambda.inc.list
    }
    if(grid.tuning) tuning_sequence = lambda.grid
    else tuning_sequence = matrix(lambda.list, ncol = 1)

    # cross-validation
    folds_i = sample(rep(1:n_folds, length.out = N))
    Cstat = matrix(NA, nrow(tuning_sequence), n_folds)
    if(measure.inc=="auc") AUC = matrix(NA, nrow(tuning_sequence), n_folds)
    if(parallel){ # parallel computing
      ncores = n_folds
      #registerDoMC(ncores)
      cl <- parallel::makeCluster(detectCores())
      doParallel::registerDoParallel(cl)
      res_list = foreach(k = 1:n_folds) %dopar% {
        res = cv.em.inner(X_u, X_p, W_u, W_p, time, delta, folds_i, k,
                          model,penalty,thresh, nIter, grid.tuning, tuning_sequence,
                          gamma.inc, gamma.lat, inits,
                          measure.inc, cure_cutoff)
        return(res)
      }
      parallel::stopCluster(cl)
      if(measure.inc=="auc"){
        for (k in 1:n_folds){
          AUC[,k] = res_list[[k]][1,]
          Cstat[,k] = res_list[[k]][2,]
        }
      }else{
        for (k in 1:n_folds)  Cstat[,k] = res_list[[k]]
      }
    }else{ # no parallel computing
      for (k in 1:n_folds) {
        if(verbose) message("Fold ", k, " out of ", n_folds, " training...\n")
        res = cv.em.inner(X_u, X_p, W_u, W_p, time, delta, folds_i, k,
                          model,penalty,thresh, nIter, grid.tuning, tuning_sequence,
                          gamma.inc, gamma.lat, inits,
                          measure.inc, cure_cutoff)
        if(measure.inc=="auc"){
          AUC[,k] = res[1,]
          Cstat[,k] = res[2,]
        }else Cstat[,k] = res
      }
    }

    if(measure.inc=="auc"){ # use AUC to tune incidence and C-statistic to tune latency
      auc = rowMeans(AUC, na.rm = T)
      c_stat = rowMeans(Cstat, na.rm = T)
      if(one.se){
        auc_sd = sqrt(apply(AUC, 1, var, na.rm = T)*(n_folds-1)/n_folds/(nrow(X_p)-1))
        c_stat_sd = sqrt(apply(Cstat, 1, var, na.rm = T)*(n_folds-1)/n_folds/(nrow(X_p)-1))
        opt_step_b = which.max(auc)
        opt_step_beta = which.max(c_stat)
        model_select_b = which(auc >= (auc[opt_step_b] - auc_sd[opt_step_b]))[1]
        model_select_beta = which(c_stat >= (c_stat[opt_step_beta] - c_stat_sd[opt_step_beta]))[1]
      }else{
        model_select_b = which.max(auc)
        model_select_beta = which.max(c_stat)
      }
    }
    else{ # use C-statistic to tune both components
      c_stat = rowMeans(Cstat, na.rm = T)
      if(one.se){
        c_stat_sd = sqrt(apply(Cstat, 1, var, na.rm = T)*(n_folds-1)/n_folds/(nrow(X_p)-1))
        opt_step = which.max(c_stat)
        model_select <- model_select_b <- model_select_beta <-
          which(c_stat >= (c_stat[opt_step] - c_stat_sd[opt_step]))[1]
      }else model_select <- model_select_b <- model_select_beta <- which.max(c_stat)
    }
    optimal_lambda_b = tuning_sequence[model_select_b,1]
    if(grid.tuning) optimal_lambda_beta = tuning_sequence[model_select_beta,2]
    else optimal_lambda_beta = tuning_sequence[model_select_beta,1]
    if(verbose){
      message("Selected lambda for incidence: ",round(optimal_lambda_b,3),"\n")
      message("Selected lambda for latency: ", round(optimal_lambda_beta,3),"\n")
      message("Maximum C-statistic: ",max(c_stat),"\n")
      if(measure.inc=="auc") message("Maximum AUC: ",max(auc),"\n")
      if(verbose) message("Fitting a final model...\n")
    }
    if(is.null(X_u)) penalty.factor.inc = rep(1,ncol(X_p))
    else penalty.factor.inc = rep(0:1,c(ncol(X_u), ncol(X_p)))
    if(is.null(W_u)) penalty.factor.lat = rep(1,ncol(W_p))
    else penalty.factor.lat = rep(0:1,c(ncol(W_u), ncol(W_p)))
    output = cure.em(X_u, X_p, W_u, W_p,
                     time, delta, model, penalty,
                     penalty.factor.inc, penalty.factor.lat, thresh, nIter,
                     optimal_lambda_b, optimal_lambda_beta, gamma.inc, gamma.lat, inits)
    #output = cure.em(cbind(X_u, X_p), cbind(W_u, W_p),
    #                 time, delta, model, penalty,
    #                 penalty.factor.inc, penalty.factor.lat, thresh, nIter,
    #                 optimal_lambda_b, optimal_lambda_beta, gamma.inc, gamma.lat, inits)
    output$selected.lambda.inc=optimal_lambda_b
    output$selected.lambda.lat=optimal_lambda_beta
    output$max.c = max(c_stat)
    if(measure.inc=="auc") output$max.auc = max(auc)
    return(output)
  }

cv.gmifs.fdr <-
  function(X_u, X_p, W_u, W_p, time, delta, model=c("weibull","exponential"),
           fdr=0.2, thresh=1e-5, nIter=1e4, epsilon=0.001, inits=NULL,
           n_folds=5, measure.inc=c("c","auc"), one.se=FALSE, cure_cutoff=5,
           parallel=FALSE, seed=NULL, verbose=TRUE){
    if(!is.null(seed)) set.seed(seed)
    X_k = knockoff::create.second_order(X_p, method = "asdp", shrink = T)
    W_k = knockoff::create.second_order(W_p, method = "asdp", shrink = T)
    Xaug = cbind(X_p, X_k)
    Waug = cbind(W_p, W_k)
    fit = cv.gmifs.nofdr(X_u, Xaug, W_u, Waug, time, delta, model, thresh, nIter, epsilon, inits,
                         n_folds, measure.inc, one.se, cure_cutoff,
                         parallel, seed, verbose)
    b_p <- fit$b_p
    b_u <- fit$b_u
    beta_p <- fit$beta_p
    beta_u <- fit$beta_u
    Z_b = abs(fit$b_p)
    Z_beta = abs(fit$beta_p)
    J = ncol(X_p)
    M = ncol(W_p)
    orig_b = 1:J
    orig_beta = 1:M
    W_b = Z_b[orig_b] - Z_b[orig_b+J]
    W_beta = Z_beta[orig_beta] - Z_beta[orig_beta+M]
    T_b = knockoff::knockoff.threshold(W_b, fdr = fdr, offset = 1)
    T_beta = knockoff::knockoff.threshold(W_beta, fdr = fdr, offset = 1)
    selected_b = (1:J)[W_b > T_b]
    selected_beta = (1:M)[W_beta > T_beta]
    # KJA 05-09 added b_u code above and this code to output b0, b, beta, rate, and alpha
    b_p <- b_p[1:J]
    beta_p <- beta_p[1:M]
    if (identical(unname(selected_b), integer(0))) {
      b_p[1:J] <- 0
    } else {
      b_p[-selected_b] <- 0
    }
    if (identical(unname(selected_beta), integer(0))) {
      beta_p[1:M] <- 0
    } else {
      beta_p[-selected_beta] <- 0
    }
    return(list(b0 = fit$b0, b_u = b_u, b_p = b_p, beta_u = beta_u, beta_p = beta_p, rate = fit$rate, alpha = fit$alpha, selected_b=selected_b, selected_beta=selected_beta))
  }

cv.gmifs.inner <-
  function(X_u, X_p, W_u, W_p, time, delta, folds_i, k,
           model=c("weibull","exponential"),thresh=1e-5, nIter=1e4, epsilon=0.001,
           inits=NULL, measure.inc=c("c","auc"), cure_cutoff=5, verbose=TRUE){
    test_i = which(folds_i == k)
    test_delta = delta[test_i]
    test_time = time[test_i]
    test_X_p = X_p[test_i,,drop=F]
    test_W_p = W_p[test_i,,drop=F]
    if(is.null(X_u)) test_X_u = NULL else test_X_u = X_u[test_i,,drop=F]
    if(is.null(W_u)) test_W_u = NULL else test_W_u = W_u[test_i,,drop=F]
    if(is.null(X_u)) X_u_train = NULL else X_u_train = X_u[-test_i,,drop=F]
    if(is.null(W_u)) W_u_train = NULL else W_u_train = W_u[-test_i,,drop=F]
    if(model=="exponential")
      train_out = exp_cure(X_u_train, X_p[-test_i,],
                           W_u_train, W_p[-test_i,],
                           time[-test_i], delta[-test_i],
                           epsilon, thresh, nIter, inits, verbose)
    else if(model=="weibull")
      train_out = weibull.cure(X_u_train, X_p[-test_i,],
                               W_u_train, W_p[-test_i,],
                               time[-test_i], delta[-test_i],
                               epsilon, thresh, nIter, inits, verbose)
    cst = sapply(1:length(train_out$itct_path),
                 function(x) {
                   if(is.null(X_u)) b_u = NULL else b_u = train_out$b_u_path[x,]
                   if(is.null(W_u)) beta_u = NULL else beta_u = train_out$beta_u_path[x,]
                   cs = C.stat(cure_cutoff=cure_cutoff,
                               b_u, train_out$itct_path[x], train_out$b_p_path[x,],
                               beta_u, train_out$beta_p_path[x,],
                               test_delta, test_time, test_X_u, test_X_p, test_W_u, test_W_p)
                   return(cs)
                 })
    if(measure.inc=="auc"){
      auc = sapply(1:length(train_out$itct_path),
                   function(x) {
                     if(is.null(X_u)) b_u = NULL else b_u = train_out$b_u_path[x,]
                     if(is.null(W_u)) beta_u = NULL else beta_u = train_out$beta_u_path[x,]
                     a = AUC_msi(cure_cutoff = cure_cutoff, b_u, train_out$itct_path[x],
                                 train_out$b_p_path[x,], test_delta, test_time, test_X_u, test_X_p)
                     return(a)
                   })
      res = rbind(c(auc, rep(NA, nIter-length(auc))), c(cst, rep(NA, nIter-length(cst))))
      rownames(res) = c("AUC","Cstat")
    }
    else res = c(cst, rep(NA, nIter-length(cst)))
    return(res)
  }

cv.gmifs.nofdr <-
  function(X_u, X_p, W_u, W_p, time, delta, model=c("weibull","exponential"),
           thresh=1e-5, nIter=1e4, epsilon=0.001, inits=NULL,
           n_folds=5, measure.inc=c("c","auc"), one.se=FALSE, cure_cutoff=5,
           parallel=FALSE, seed=NULL, verbose){
    if(!is.null(seed)) set.seed(seed)
    folds_i = sample(rep(1:n_folds, length.out = length(time)))
    Cstat = matrix(NA, nIter, n_folds)
    if(measure.inc=="auc") AUC = matrix(NA, nIter, n_folds)
    if(parallel){ # parallel computing
      ncores = n_folds
      #registerDoMC(ncores) # for doMC
      cl <- parallel::makeCluster(detectCores())
      doParallel::registerDoParallel(cl)
      res_list = foreach(k = 1:n_folds) %dopar% {
        res = cv.gmifs.inner(X_u, X_p, W_u, W_p, time, delta, folds_i, k,
                             model,thresh, nIter, epsilon, inits, measure.inc, cure_cutoff, verbose)
        return(res)
      }
      parallel::stopCluster(cl)
      if(measure.inc=="auc"){
        for (k in 1:n_folds){
          AUC[,k] = res_list[[k]][1,]
          Cstat[,k] = res_list[[k]][2,]
        }
      }else{
        for (k in 1:n_folds)  Cstat[,k] = res_list[[k]]
      }
    }else{ # no parallel computing
      for (k in 1:n_folds) {
        if(verbose) message("Fold ", k, " out of ", n_folds, " training...\n")
        res = cv.gmifs.inner(X_u, X_p, W_u, W_p, time, delta, folds_i, k,
                             model,thresh, nIter, epsilon, inits, measure.inc, cure_cutoff, verbose)
        if(measure.inc=="auc"){
          AUC[,k] = res[1,]
          Cstat[,k] = res[2,]
        }else Cstat[,k] = res
      }
    }
    if(measure.inc=="auc"){ # use AUC to tune incidence and C-statistic to tune latency
      auc = rowMeans(AUC, na.rm = T)
      c_stat = rowMeans(Cstat, na.rm = T)
      if(one.se){
        auc_sd = sqrt(apply(AUC, 1, var, na.rm = T)*(n_folds-1)/n_folds/(nrow(X_p)-1))
        c_stat_sd = sqrt(apply(Cstat, 1, var, na.rm = T)*(n_folds-1)/n_folds/(nrow(X_p)-1))
        opt_step_b = which.max(auc)
        opt_step_beta = which.max(c_stat)
        model_select_b = which(auc >= (auc[opt_step_b] - auc_sd[opt_step_b]))[1]
        model_select_beta = which(c_stat >= (c_stat[opt_step_beta] - c_stat_sd[opt_step_beta]))[1]
      }else{
        model_select_b = which.max(auc)
        model_select_beta = which.max(c_stat)
      }
      if(verbose){
        message("Selected step for incidence: ",model_select_b,"\n")
        message("Selected step for latency: ",model_select_beta,"\n")
        message("Maximum AUC: ",max(auc, na.rm = T),"\n")
        message("Maximum C-statistic: ",max(c_stat, na.rm = T),"\n")
        message("Fitting a final model...\n")
      }
      if(model=="exponential")
        fit = exp_cure(X_u, X_p, W_u, W_p, time, delta, epsilon, thresh,
                       nIter=max(model_select_b, model_select_beta), inits, verbose)
      else if(model=="weibull")
        fit = weibull.cure(X_u, X_p, W_u, W_p, time, delta, epsilon, thresh,
                           nIter=max(model_select_b, model_select_beta), inits, verbose)
    }
    else{ # use C-statistic to tune both components
      c_stat = rowMeans(Cstat, na.rm = T)
      if(one.se){
        c_stat_sd = sqrt(apply(Cstat, 1, var, na.rm = T)*(n_folds-1)/n_folds/(nrow(X_p)-1))
        opt_step = which.max(c_stat)
        model_select <- model_select_b <- model_select_beta <-
          which(c_stat >= (c_stat[opt_step] - c_stat_sd[opt_step]))[1]
      }else model_select <- model_select_b <- model_select_beta <- which.max(c_stat)
      if(verbose){
        message("Selected step: ",model_select,"\n")
        message("Maximum C-statistic: ",max(c_stat, na.rm = T),"\n")
        message("Fitting a final model...\n")
      }
      if(model=="exponential")
        fit = exp_cure(X_u, X_p, W_u, W_p, time, delta, epsilon, thresh, nIter=model_select, inits,
                       verbose)
      else if(model=="weibull")
        fit = weibull.cure(X_u, X_p, W_u, W_p, time, delta, epsilon, thresh, nIter=model_select,
                           inits, verbose)
    }
    nstep = length(fit$logLikelihood)
    b_p = fit$b_p_path[nstep,]
    b_u = fit$b_u_path[nstep,]
    b0 = fit$itct_path[nstep]
    beta_p = fit$beta_p_path[nstep,]
    beta_u = fit$beta_u_path[nstep,]
    rate = fit$lambda_path[nstep]
    if(model=="weibull") alpha = fit$alpha_path[nstep]
    else alpha = 1
    output = list(model.select.inc=model_select_b, model.select.lat=model_select_beta,
                  b0=b0, b_u=b_u, b_p=b_p, beta_u=beta_u, beta_p=beta_p, alpha=alpha, rate=rate,
                  logLik = fit$logLikelihood[nstep], max.c = max(c_stat, na.rm = T))
    if(measure.inc=="auc") output$max.auc = max(auc, na.rm = T)
    return(output)
  }

exp_cure <-
  function(X_u=NULL, X_p, W_u=NULL, W_p, time, delta, epsilon = 0.001, tol = 1e-05,
           nIter=1e4, inits=NULL, verbose)
  {

    # X_u: N by nonp, non-penalized covariate matrix associated with incidence
    # X_p: N by J, penalized covariate matrix associated with incidence
    # W_u: N by nonp, non-penalized covariate matrix associated with latency
    # W_p:  N by M, penalized covariate matrix associated with incidence
    # time: vector of length N, observed survival time
    # delta: vector of length N, censoring status (not censored = 0)
    # epsilon: incremental size
    # tol: difference between log-likelihood

    N = length(time)
    J = ncol(X_p) # number of penalized incidence covariates
    M = ncol(W_p) # number of penalized latency covariates
    X_p = cbind(X_p, -X_p) # N by 2J
    W_p = cbind(W_p, -W_p) # N by 2M

    ######## initialization ########
    step = 1
    b_p = rep(0, 2*J) # penalized incidence
    beta_p = rep(0, 2*M) # penalized latency
    if(is.null(inits)) inits = initialization_parametric(X_u, W_u, time, delta, model="weibull")
    itct = inits$itct; b_u = inits$b_u; beta_u = inits$beta_u; lambda = inits$lambda
    log_lambda = log(max(lambda, 1e-15))
    LL0 = 0

    b_p_path <- beta_p_path <- lambda_path <- b_u_path <- beta_u_path <- itct_path <- NULL
    logLikelihood <- numeric()

    ######## loop ########

    repeat{

      #### update penalized parameters
      upd = exp_update(lambda, b_p, beta_p, b_u, itct, beta_u, X_u , X_p, W_u, W_p, time, delta, epsilon)
      b_p = upd$b_p
      beta_p = upd$beta_p
      b_p_path = rbind(b_p_path, b_p)
      beta_p_path = rbind(beta_p_path, beta_p)

      #### update other parameters
      out = optim(par = c(log_lambda, b_u, itct, beta_u), fn = exp_negloglik, gr = exp_gradient,
                  b_p = b_p, beta_p = beta_p, X_u = X_u, X_p = X_p, W_u = W_u, W_p = W_p,
                  time = time, delta = delta, method="BFGS")
      log_lambda = out$par[1]
      lambda = exp(log_lambda)
      if(!is.null(X_u)) {
        b_u = out$par[2:(ncol(X_u)+1)] # unpenalized incidence
        itct = out$par[ncol(X_u)+2]
        if(!is.null(W_u)) beta_u = out$par[(ncol(X_u)+3):(ncol(X_u)+ncol(W_u)+2)] # unpenalized latency
      } else{
        itct = out$par[2]
        if(!is.null(W_u)) beta_u = out$par[3:(ncol(W_u)+2)] # unpenalized latency
      }

      lambda_path = c(lambda_path, lambda)
      if(!is.null(X_u)) b_u_path = rbind(b_u_path, b_u)
      itct_path = c(itct_path, itct)
      if(!is.null(W_u)) beta_u_path = rbind(beta_u_path, beta_u)

      LL1 = -out$value
      logLikelihood = c(logLikelihood, LL1)

      if(verbose & step%%1000==0) message("step = ", step, "\n")
      if (step > 1 && (abs(LL1 - LL0) < tol | step >= nIter)) { #
        break
      }
      LL0 <- LL1
      step <- 1 + step
    }

    ######## output ########
    b_p_path = b_p_path[,1:J] - b_p_path[,(J+1):(2*J)]
    beta_p_path = beta_p_path[,1:M] - beta_p_path[,(M+1):(2*M)]
    output <- list(b_p_path = b_p_path, beta_p_path = beta_p_path,
                   lambda_path = lambda_path, b_u_path = b_u_path, itct_path = itct_path,
                   beta_u_path = beta_u_path, logLikelihood = logLikelihood)
    output
  }


exp_EM <-
  function(X_u=NULL, X_p, W_u=NULL, W_p, time, delta, mu_inc, mu_lat,
           inits=NULL, nIter = 100, tol = 1e-4){
    # mu: penalty parameter
    # tol: difference between log-likelihood
    N = length(time)
    J = ncol(X_p) # number of penalized incidence covariates
    M = ncol(W_p) # number of penalized latency covariates

    ######## initialization ########
    step = 1
    b_p <- rep(0,J)
    beta_p <- rep(0,M)
    b_p_ext <- rep(0,2*J)
    beta_p_ext <- rep(0,2*M)
    if(is.null(inits)) inits = initialization_parametric(X_u, W_u, time, delta, model="weibull")
    itct = inits$itct; b_u = inits$b_u; beta_u = inits$beta_u; lambda = inits$lambda
    log_lambda = log(max(lambda, 1e-15))
    pir = rep(1, N)
    llp1_0 <- llp2_0 <-0
    lik_inc <- lik_lat <- c()
    b_p_path <- beta_p_path <- b_u_path <- beta_u_path <- itct_path <- lambda_path <- NULL
    conv1 <- conv2 <- FALSE

    ######## loop ########

    repeat{

      #### E-step
      if(is.null(X_u)) b_x = itct + X_p[,b_p!=0,drop=FALSE] %*% b_p[b_p!=0]
      else b_x = itct + X_u %*% b_u + X_p[,b_p!=0,drop=FALSE] %*% b_p[b_p!=0]
      if(is.null(W_u)) beta_w = W_p[,beta_p!=0,drop=FALSE] %*% beta_p[beta_p!=0]
      else beta_w = W_u %*% beta_u + W_p[,beta_p!=0,drop=FALSE] %*% beta_p[beta_p!=0]
      pir[delta==0] = 1/(1+exp(-b_x[delta==0]+lambda * time[delta==0] * exp(beta_w[delta==0])))
      #### update penalized parameters
      if (!conv1){
        b_p_ext = optim(par = b_p_ext, fn = l1_negloglik_inc_b, gr = l1_grad_b,
                        itct = itct, b_u = b_u, X_u = X_u, X_p = X_p,
                        pir = pir, mu = mu_inc, method = 'L-BFGS-B', lower = rep(0, 2*J))$par
        b_p = b_p_ext[1:J]-b_p_ext[(J+1):(2*J)]
      }

      if (!conv2){
        beta_p_ext = optim(par = beta_p_ext, fn = exp_negloglik_lat_beta, gr = exp_grad_beta,
                           lambda = lambda,
                           beta_u = beta_u, W_u = W_u, W_p = W_p, time = time, delta = delta, pir = pir, mu = mu_lat,
                           method = 'L-BFGS-B', lower = rep(0, 2*M))$par
        beta_p = beta_p_ext[1:M]-beta_p_ext[(M+1):(2*M)]
      }

      #### update nonpenalized parameters
      if (!conv1){
        out_inc = optim(par = c(itct,b_u), fn = l1_negloglik_inc, gr = l1_gradient_inc,
                        b_p = b_p, X_u =X_u, X_p = X_p, pir = pir, mu = mu_inc, method="BFGS") # incidence
        itct = out_inc$par[1]
        if(!is.null(X_u)) b_u = out_inc$par[2:(ncol(X_u)+1)]
        llp1_1 = -out_inc$value
      }
      lik_inc = c(lik_inc, llp1_1)

      if (!conv2){
        out_lat = optim(par = c(log_lambda, beta_u), fn = exp_negloglik_lat, gr = exp_gradient_lat,
                        beta_p=beta_p, W_u=W_u, W_p=W_p, time=time, delta=delta, pir=pir, mu=mu_lat,
                        method="BFGS") # latency
        log_lambda = out_lat$par[1]
        lambda = exp(log_lambda)
        if(!is.null(W_u)) beta_u = out_lat$par[2:(ncol(W_u)+1)] # unpenalized incidence
        llp2_1 = -out_lat$value
      }
      lik_lat = c(lik_lat, llp2_1)

      ## record updated parameters
      itct_path = c(itct_path, itct)
      if(!is.null(X_u)) b_u_path = rbind(b_u_path, b_u)
      b_p_path = rbind(b_p_path, b_p)
      if(!is.null(W_u)) beta_u_path = rbind(beta_u_path, beta_u)
      beta_p_path = rbind(beta_p_path, beta_p)
      lambda_path = c(lambda_path, lambda)
      if (!conv1 & abs(llp1_1- llp1_0)< tol) conv1 <- TRUE
      if (!conv2 & abs(llp2_1- llp2_0)< tol) conv2 <- TRUE
      if (step > 1 & (conv1 & conv2) | step >= nIter) { #
        break
      }
      llp1_0 <- llp1_1
      llp2_0 <- llp2_1
      step <- 1 + step
    }

    ######## output ########
    output <- list(b_p_path = b_p_path, b_u_path = b_u_path, itct_path = itct_path,
                   beta_p_path = beta_p_path, beta_u_path = beta_u_path, lambda_path = lambda_path,
                   lik_inc = lik_inc, lik_lat = lik_lat)
    output
  }

exp_grad_beta <-
  function(theta, lambda, beta_u, W_u, W_p, time, delta, pir, mu){
    N = nrow(W_p)
    M = ncol(W_p)
    beta_p_ext = theta
    beta_p = beta_p_ext[1:M]-beta_p_ext[(M+1):(2*M)]
    beta_nonzero = which(beta_p!=0)
    if(!is.null(W_u)) betaw = W_u %*% beta_u + W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    else betaw = W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    temp1 = pir*lambda*time*exp(betaw)
    grad = matrix(delta-temp1,1)%*% W_p
    return(c(-grad+ N*mu, grad+ N*mu))
  }

exp_gradient_lat <-
  function(theta, beta_p, W_u, W_p, time, delta, pir, mu){ # latency
    log_lambda = theta[1]
    lambda = exp(log_lambda)
    if(!is.null(W_u)) beta_u = theta[2:(ncol(W_u)+1)]
    beta_nonzero = which(beta_p!=0)
    if(!is.null(W_u)) betaw = W_u %*% beta_u + W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    else betaw = W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    temp1 = pir*lambda*time*exp(betaw)
    temp2 = delta-temp1
    grad2 = sum(temp2)
    if(!is.null(W_u)) grad3 = matrix(temp2,1) %*% W_u else grad3=NULL
    return(-c(grad2, grad3))
  }

exp_gradient <-
  function(theta, b_p, beta_p, X_u , X_p, W_u, W_p, time, delta)
  {
    N = length(time)
    lambda = exp(theta[1])
    if(!is.null(X_u)) {
      b_u = theta[2:(ncol(X_u)+1)] # unpenalized incidence
      itct = theta[ncol(X_u)+2]
      if(!is.null(W_u)) beta_u = theta[(ncol(X_u)+3):(ncol(X_u)+ncol(W_u)+2)] # unpenalized latency
    } else{
      itct = theta[2]
      if(!is.null(W_u)) beta_u = theta[3:(ncol(W_u)+2)] # unpenalized latency
    }
    b_nonzero = which(b_p!=0)
    beta_nonzero = which(beta_p!=0)
    if(!is.null(X_u)) C_b = exp(itct + X_u %*% b_u + X_p[,b_nonzero,drop=FALSE] %*% b_p[b_nonzero])
    else C_b = exp(itct + X_p[,b_nonzero,drop=FALSE] %*% b_p[b_nonzero])
    if(!is.null(W_u)) C_beta = exp(W_u %*% beta_u + W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero])
    else C_beta = exp(W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero])
    C_lbeta = exp(-lambda * time * C_beta)
    temp1 = log (pmax(lambda*time,rep(1e-100,N)))
    temp2 = lambda * time * C_beta
    temp3 = 1 / (1 + C_b*C_lbeta)
    temp4 = (1-delta) * C_b * C_lbeta * temp2 * temp3
    temp5 = delta * (1-temp2)
    temp6 = 1/(1+C_b)
    temp7 = temp6*(delta - (1-delta) * C_b * (1-C_lbeta) * temp3)
    grad2 = sum(temp5 - temp4)
    if(!is.null(X_u)) grad3 = matrix(temp7,1) %*% X_u else grad3=NULL
    grad4 = sum(temp7) # intercept
    if(!is.null(W_u)) grad5 = matrix(temp5 - temp4, 1) %*% W_u else grad5=NULL
    return(-c(grad2, grad3, grad4, grad5))
  }

exp_negloglik_lat_beta <-
  function(theta, lambda, beta_u, W_u, W_p, time, delta, pir, mu){ # latency
    N = nrow(W_p)
    M = ncol(W_p)
    beta_p_ext = theta
    beta_p = beta_p_ext[1:M]-beta_p_ext[(M+1):(2*M)]
    beta_nonzero = which(beta_p!=0)
    if(!is.null(W_u)) betaw = W_u %*% beta_u + W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    else betaw = W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    ll1 = delta*( log(lambda)+betaw )
    ll2 = -pir*lambda*time*exp(betaw)
    return(-sum(ll1+ll2)+N*mu*sum(beta_p_ext))
  }

exp_negloglik_lat <-
  function(theta, beta_p, W_u, W_p, time, delta, pir, mu){ # latency
    N = nrow(W_p)
    log_lambda = theta[1]
    lambda = exp(log_lambda)
    if(!is.null(W_u)) beta_u = theta[2:(ncol(W_u)+1)]
    beta_nonzero = which(beta_p!=0)
    if(!is.null(W_u)) betaw = W_u %*% beta_u + W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    else betaw = W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    ll1 = delta*( log_lambda +betaw )
    ll2 = -pir*lambda*time*exp(betaw)
    return(-sum(ll1+ll2)+N*mu*sum(abs(beta_p)))
  }

exp_negloglik <-
  function(theta, b_p, beta_p, X_u , X_p, W_u, W_p, time, delta)
  {
    N = length(time)
    lambda = exp(theta[1])
    if(!is.null(X_u)) {
      b_u = theta[2:(ncol(X_u)+1)] # unpenalized incidence
      itct = theta[ncol(X_u)+2]
      if(!is.null(W_u)) beta_u = theta[(ncol(X_u)+3):(ncol(X_u)+ncol(W_u)+2)] # unpenalized latency
    } else{
      itct = theta[2]
      if(!is.null(W_u)) beta_u = theta[3:(ncol(W_u)+2)] # unpenalized latency
    }
    b_nonzero = which(b_p!=0)
    beta_nonzero = which(beta_p!=0)
    if(!is.null(X_u)) logC_b = itct + X_u %*% b_u + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    else logC_b = itct + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    if(!is.null(W_u)) logC_beta = W_u %*% beta_u + W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero]
    else logC_beta = W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero]
    temp1 = 1/(1+exp(-logC_b)) # exp(logC_b) / (1+exp(logC_b)) #Mar17
    logC_lbeta = - lambda * time * exp(logC_beta)
    ll1 = delta * (log(temp1) + theta[1] + logC_beta + logC_lbeta)
    ll2 = (1-delta) * log(pmax(1 - temp1 * (1-exp(logC_lbeta)),rep(1e-100,N)))
    return(-sum(ll1+ll2))
  }

exp_update <-
  function(lambda, b_p, beta_p, b_u, itct, beta_u, X_u , X_p, W_u, W_p, time, delta, epsilon)
  {
    b_nonzero = which(b_p!=0)
    beta_nonzero = which(beta_p!=0)
    if(!is.null(X_u)) C_b = exp(itct + X_u %*% b_u + X_p[,b_nonzero,drop=FALSE] %*% b_p[b_nonzero])
    else C_b = exp(itct + X_p[,b_nonzero,drop=FALSE] %*% b_p[b_nonzero])
    if(!is.null(W_u)) C_beta = exp(W_u %*% beta_u + W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero])
    else C_beta = exp(W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero])
    C_lbeta = exp(-lambda * time * C_beta)
    temp2 = lambda * time * C_beta
    temp3 = 1 / (1 + C_b*C_lbeta)
    temp4 = (1-delta) * C_b * C_lbeta * temp2 * temp3
    temp5 = delta * (1-temp2)
    temp6 = 1/(1+C_b)
    ### update b_p
    grad_b = matrix(temp6*(delta - (1-delta) * C_b * (1-C_lbeta) * temp3),1) %*% X_p # length J
    j_b = which.max(grad_b)
    b_p[j_b] = b_p[j_b] + epsilon
    ### update beta_p
    grad_beta = matrix(temp5 - temp4,1) %*% W_p # length M
    j_beta = which.max(grad_beta)
    beta_p[j_beta] = beta_p[j_beta] + epsilon

    return(list(b_p = b_p, beta_p = beta_p))
  }

initialization_cox <-
  function(X_u=NULL, W_u=NULL, time, delta){
    data_init = as.data.frame(cbind(time, delta, X_u, W_u))
    if(is.null(X_u) & is.null(W_u)){
      fit = glm(delta~1, family="binomial")
      surv = survival::survfit(Surv(time, delta)~1)
      survprob = sapply(time, function(x) summary(surv, times = x, extend=T)$surv)
      inits = list(itct=coef(fit), b_u=NULL, beta_u=NULL, survprob=survprob)
    } else if(is.null(X_u)){
      colnames(data_init) <- c("time","delta",paste0("lat",1:ncol(W_u)))
      formula_lat = as.formula(paste("Surv(time, delta) ~", paste(paste0("lat",1:ncol(W_u)),collapse=" + ")))
      fit1 = survival::coxph(formula_lat, data = data_init)
      surv = survival::survfit(fit1)
      survprob = sapply(time, function(x) summary(surv, times = x, extend=T)$surv)
      fit2 = glm(delta~1, family="binomial")
      inits = list(itct=coef(fit2), b_u=NULL, beta_u=coef(fit1), survprob=survprob)
    } else if(is.null(W_u)){
      colnames(data_init) <- c("time","delta",paste0("inc",1:ncol(X_u)))
      formula_inc = as.formula(paste("delta ~", paste(paste0("inc",1:ncol(X_u)),collapse=" + ")))
      fit = glm(formula_inc, family="binomial", data = data_init)
      surv = survival::survfit(Surv(time, delta)~1)
      survprob = sapply(time, function(x) summary(surv, times = x, extend=T)$surv)
      itct = coef(fit)[1]
      b_u = coef(fit)[-1]
      inits = list(itct=itct, b_u=b_u, beta_u=NULL, survprob=survprob)
    } else{
      colnames(data_init) <- c("time","delta",paste0("inc",1:ncol(X_u)), paste0("lat",1:ncol(W_u)))
      formula_lat = as.formula(paste("Surv(time, delta) ~", paste(paste0("lat",1:ncol(W_u)),collapse=" + ")))
      formula_inc = as.formula(paste("delta ~", paste(paste0("inc",1:ncol(X_u)),collapse=" + ")))
      fit1 = glm(formula_inc, family="binomial", data = data_init)
      itct = coef(fit1)[1]
      b_u = coef(fit1)[-1]
      fit2 = survival::coxph(formula_lat, data = data_init)
      beta_u=coef(fit2)
      surv = survival::survfit(fit2)
      survprob = sapply(time, function(x) summary(surv, times = x, extend=T)$surv)
      inits = list(itct=itct, b_u=b_u, beta_u=beta_u, survprob=survprob)
    }
    return(inits)
  }

get_cox_lambda_max <- function (x, time, event, offset = rep(0, nrow(x)))
{
  N <- nrow(x)
  beta_w <- offset
  beta_w <- beta_w - mean(beta_w)

  event_time = time[event==1]
  uniq_event_time = unique(event_time)
  n0 = length(uniq_event_time)
  I0 = matrix(time, ncol = 1) %*% matrix(1, 1, n0) >=
    matrix(1, N, 1) %*% matrix(uniq_event_time, nrow = 1)    # N*n0
  d_j = as.numeric(table(event_time)[rank(uniq_event_time)])

  exp_beta_w = exp(beta_w)
  sum0 = pmax(1e-50, matrix(exp_beta_w,1) %*% I0)   # 1*n0
  sum1 = t(x) %*% diag(as.numeric(exp_beta_w)) %*% I0   # ncol(x) * n0
  null_grad = colSums(x[event==1,,drop=FALSE])- rowSums(sum1 %*% diag(as.numeric(d_j/sum0))) # ncol(x) * 1
  g <- abs(null_grad)
  lambda_max <- max(g)/N
  return(lambda_max)
}

initialization_parametric <-
  function(X_u=NULL, W_u=NULL, time, delta, model="weibull"){
    if(!is.null(X_u)){
      fit1 = glm(as.formula(paste("delta ~", paste(colnames(as.data.frame(X_u)),collapse=" + "))),
                 data =as.data.frame(X_u), family = binomial(link="logit"))
      b_u = fit1$coefficients[-1]
      itct = fit1$coefficients[1]
    } else{
      itct = log(mean(delta)/(1-mean(delta)))
      b_u = NULL
    }
    if(!is.null(W_u)) {
      fit2 = survival::coxph(as.formula(paste("Surv(time, delta) ~", paste(colnames(as.data.frame(W_u)),collapse=" + "))),
                             data =as.data.frame(W_u),ties = "breslow")
      beta_u <- coef(fit2)
    } else beta_u = NULL
    if(model=="weibull"){
      alpha = uniroot(function(a) # MOM estimates
        log(gamma(1+2/a))-2*log(gamma(1+1/a))-log(var(time)+(mean(time))^2)+2*log(mean(time)),
        c(0.01,10))$root
      lambda = gamma(1+1/alpha)/mean(time)
    }else if(model=="exponential"){
      lambda = 1/mean(time)
    }
    inits = list(itct=itct, b_u=b_u, beta_u=beta_u, lambda=lambda)
    if(model=="weibull") inits$alpha = alpha
    return(inits)
  }

inits_check <-
  function(model, N, penalty.factor.inc, penalty.factor.lat, inits){
    if(is.null(inits$itct)){
      warning("Initial value of intercept is missing. Initial values were generated by the program.")
      return(NULL)
    }
    if(model=="cox"){
      if(is.null(inits$survprob)){
        warning("Initial value of survprob is missing. Initial values were generated by the program.")
        return(NULL)
      }else if(length(inits$survprob)!=N){
        warning("Initial value of survprob has incorrect dimension. Initial values were generated by the program.")
        return(NULL)
      }
    }
    if(model %in% c("weibull","exponential")){
      if(is.null(inits$lambda)){
        warning("Initial value of lambda is missing. Initial values were generated by the program.")
        return(NULL)
      }else if(inits$lambda<=0){
        warning("Initial value of lambda should be positive. Initial values were generated by the program.")
        return(NULL)
      }
    }
    if(model=="weibull"){
      if(is.null(inits$alpha)){
        warning("Initial value of alpha is missing. Initial values were generated by the program.")
        return(NULL)
      }else if(inits$alpha<=0){
        warning("Initial value of alpha should be positive. Initial values were generated by the program.")
        return(NULL)
      }
    }
    if(any(penalty.factor.inc==0))
      if(is.null(inits$b_u)){
        warning("Initial value of b_u is missing. Initial values were generated by the program.")
        return(NULL)
      } else if(length(inits$b_u)!=sum(penalty.factor.inc==0)){
        warning("Initial value of b_u has incorrect dimension. Initial values were generated by the program.")
        return(NULL)
      }
    if(any(penalty.factor.lat==0))
      if(is.null(inits$beta_u)){
        warning("Initial value of beta_u is missing. Initial values were generated by the program.")
        return(NULL)
      }else if(length(inits$beta_u)!=sum(penalty.factor.lat==0)){
        warning("Initial value of beta_u has incorrect dimension. Initial values were generated by the program.")
        return(NULL)
      }
    return(inits)
  }

l1_grad_b <-
  function(theta, itct, b_u, X_u , X_p, pir, mu){
    N = nrow(X_p)
    J = ncol(X_p)
    b_p_ext = theta
    b_p = b_p_ext[1:J]-b_p_ext[(J+1):(2*J)]
    b_nonzero = which(b_p!=0)
    if(!is.null(X_u)) bx = itct + X_u %*% b_u + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    else bx = itct + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    pix = 1/(1+exp(-bx))
    grad = matrix(pir-pix,1) %*% X_p
    return(c(-grad+N*mu, grad+N*mu))
  }

l1_gradient_inc <-
  function(theta, b_p, X_u , X_p, pir, mu) # incidence
  {
    itct = theta[1]
    if(!is.null(X_u)) b_u = theta[2:(ncol(X_u)+1)]
    b_nonzero = which(b_p!=0)
    if(!is.null(X_u)) bx = itct + X_u %*% b_u + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    else bx = itct + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    pix = 1/(1+exp(-bx))
    grad1 = sum(pir-pix)
    if(!is.null(X_u)) grad2 = matrix(pir-pix,1) %*% X_u else grad2=NULL
    return(-c(grad1, grad2))
  }

l1_negloglik_inc_b <-
  function(theta, itct, b_u, X_u , X_p, pir, mu) # incidence
  {
    N = nrow(X_p)
    J = ncol(X_p)
    b_p_ext = theta
    b_p = b_p_ext[1:J]-b_p_ext[(J+1):(2*J)]
    b_nonzero = which(b_p!=0)
    if(!is.null(X_u)) bx = itct + X_u %*% b_u + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    else bx = itct + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    ll = sum(pir*bx - log(1+exp(bx)))-N*mu*sum(b_p_ext)
    return(-ll)
  }

l1_negloglik_inc <-
  function(theta, b_p, X_u , X_p, pir, mu) # incidence
  {
    itct = theta[1]
    if(!is.null(X_u)) b_u = theta[2:(ncol(X_u)+1)]
    N = nrow(X_p)
    b_nonzero = which(b_p!=0)
    if(!is.null(X_u)) bx = itct + X_u %*% b_u + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    else bx = itct + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    ll = sum(pir*bx - log(1+exp(bx)))-N*mu*sum(abs(b_p))
    return(-ll)
  }

nonparametric_time_generator <-
  function (N, beta_w, maxT = 20, knots = 8)
  {
    time <- 0:maxT
    k <- c(0, sort(sample(time[2:maxT], size = knots, replace = FALSE)), maxT)
    heights <- c(0, sort(1-pmin(rexp(knots, rate = 10),1-1e-15)), 1)
    # heights <- c(0, sort(runif(knots)), 1)
    tk <- merge(data.frame(time), data.frame(time = k, heights),
                by = "time", all = FALSE)
    MonotonicSpline <- stats::splinefun(x = tk$time, y = tk$heights,
                                        method = "hyman")
    t = rep(NA, N)
    for(i in 1:N){
      u = runif(1)
      t[i] = uniroot(function(a) (1-MonotonicSpline(a))^exp(beta_w[i])-u, c(0,maxT))$root
    }
    return(t)
  }

mcp_penalty <-
  function(b_p, gamma, lambda){
    idx = which(b_p !=0 & abs(b_p) <= gamma*lambda)
    s1 = lambda*sum(abs(b_p[idx]))-sum((b_p[idx])^2)/(2*gamma)
    s2 = gamma*lambda^2/2 * sum(abs(b_p) > gamma*lambda)
    return(s1 + s2)
  }

mcp_scad_cd_upd_inc <-
  function(penalty, pir, pi_x, x_l, bl, gamma, lambda){
    d1 = mean((pir - pi_x)*x_l)
    vl = mean(pi_x*(1-pi_x)*x_l^2)
    zl = d1 + vl * bl
    if(penalty == "MCP"){
      if(abs(bl) <= gamma*lambda) return(soft(zl, lambda)/(vl-1/gamma))
      else return(ifelse(vl==0,0,zl/vl))
    }  else if(penalty == "SCAD"){
      if(abs(bl)<=lambda) return(ifelse(vl==0,0,soft(zl, lambda)/vl))
      else if(abs(bl)<=gamma*lambda) return(soft(zl, gamma*lambda/(gamma-1))/(vl-1/(gamma-1)))
      else return(ifelse(vl==0,0,zl/vl))
    }
  }

mcp_scad_cd_upd_lat <-
  function(penalty, exp_beta_w_p, w_l, betal, Zl, I0, d_j, gamma, lambda){
    N = length(exp_beta_w_p)
    sum0 = pmax(1e-50, matrix(exp_beta_w_p,1) %*% I0)   # 1*n0
    sum1 = matrix(exp_beta_w_p * w_l,1) %*% I0   # 1*n0
    sum2 = matrix(exp_beta_w_p * w_l^2,1) %*% I0   # 1*n0
    d1 = sum(Zl - d_j * sum1/sum0)/N
    vl = -sum(d_j * (sum1^2/sum0^2 - sum2/sum0))/N
    zl = d1 + vl * betal
    if(penalty == "MCP"){
      if(abs(betal) <= gamma*lambda) return(soft(zl, lambda)/(vl-1/gamma))
      else return(ifelse(vl==0,0,zl/vl))
    }  else if(penalty == "SCAD"){
      if(abs(betal)<=lambda) return(ifelse(vl==0,0,soft(zl, lambda)/vl))
      else if(abs(betal)<=gamma*lambda) return(soft(zl, gamma*lambda/(gamma-1))/(vl-1/(gamma-1)))
      else return(ifelse(vl==0,0,zl/vl))
    }
  }

mcp_scad_gradient_inc <-
  function(theta, b_p, X_u , X_p, pir, CAP) # incidence
  {
    itct = theta[1]
    if(!is.null(X_u)) b_u = theta[2:(ncol(X_u)+1)]
    b_nonzero = which(b_p!=0)
    if(is.null(X_u)) bx = itct + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    else bx = itct + X_u %*% b_u + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    pix = 1/(1+exp( pmin(CAP, -bx) ))
    grad1 = sum(pir-pix)
    if(!is.null(X_u)) grad2 = matrix(pir-pix,1) %*% X_u else grad2 = NULL
    return(-c(grad1, grad2))
  }

mcp_scad_gradient_lat <-
  function(theta, beta_p, W_u, W_p, delta, pir, I0, d_j, CAP){ # latency
    beta_u = theta
    N = nrow(W_p)
    beta_nonzero = which(beta_p!=0)
    beta_w = W_u %*% beta_u + W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    exp_beta_w_p = exp( pmin(CAP, beta_w) ) * pir
    sum0 = pmax(1e-50, matrix(exp_beta_w_p,1) %*% I0)   # 1*n0
    sum1 = t(W_u) %*% diag(as.numeric(exp_beta_w_p)) %*% I0   # ncol(W_u) * n0
    du = colSums(W_u[delta==1,,drop=FALSE])- rowSums(sum1 %*% diag(as.numeric(d_j/sum0))) # ncol(W_u) * 1
    return(-du)
  }

mcp_scad_negloglik_inc <-
  function(theta, b_p, X_u , X_p, pir, CAP) # incidence
  {
    itct = theta[1]
    if(!is.null(X_u)) b_u = theta[2:(ncol(X_u)+1)]
    N = nrow(X_p)
    b_nonzero = which(b_p!=0)
    if(is.null(X_u)) bx = itct + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    else bx = itct + X_u %*% b_u + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    ll = sum(pir*bx - log(1+exp(pmin(CAP, bx))))
    return(-ll)
  }

mcp_scad_negloglik_lat <-
  function(theta, beta_p, W_u, W_p, delta, pir, I0, d_j, CAP){ # latency
    beta_u = theta
    N = nrow(W_p)
    beta_nonzero = which(beta_p!=0)
    beta_w = W_u %*% beta_u + W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    exp_beta_w_p = exp( pmin(CAP, beta_w) ) * pir
    sum0 = pmax(1e-50, matrix(exp_beta_w_p,1) %*% I0)  # 1*n0
    ll = sum(beta_w[delta==1]) - sum(d_j * log(sum0))
    return(-ll)
  }

scad_penalty <-
  function(b_p, gamma, lambda){
    idx1 = which(b_p !=0 & abs(b_p) <= lambda)
    idx2 = which(abs(b_p) > lambda & abs(b_p)<=gamma*lambda)
    s1 = lambda*sum(abs(b_p[idx1]))
    s2 = (gamma*lambda*sum(abs(b_p[idx2])) - 0.5*(sum((b_p[idx2])^2 + lambda^2)))/(gamma-1)
    s3 = lambda^2*(gamma^2-1)/(2*(gamma-1)) * sum(abs(b_p) > gamma*lambda)
    return(s1 + s2 + s3)
  }

self_scale <-
  function(X, scale){
    if(is.null(X)) return(NULL)
    if(ncol(X)==0) return(NULL)
    if (scale) {
    n = nrow(X)
    Xs = apply(X, 2, function(x) if(sd(x)==0) return(rep(0,n)) else return(scale(x)))
    } else {
      Xs = X
    }
    return(Xs)
  }

soft <-
  function(z, gamma){
    if(gamma >= abs(z)) return(0)
    else return(ifelse(z>0, z-gamma, z+gamma))
  }

weib_EM <-
  function(X_u=NULL, X_p, W_u=NULL, W_p, time, delta, mu_inc, mu_lat,
           inits=NULL, nIter = 100, tol = 1e-4)
  {
    # mu: penalty parameter
    # tol: difference between log-likelihood
    N = length(time)
    J = ncol(X_p) # number of penalized incidence covariates
    M = ncol(W_p) # number of penalized latency covariates

    ######## initialization ########
    step = 1
    b_p <- rep(0,J)
    beta_p <- rep(0,M)
    b_p_ext <- rep(0,2*J)
    beta_p_ext <- rep(0,2*M)
    if(is.null(inits)) inits = initialization_parametric(X_u, W_u, time, delta, model="weibull")
    itct = inits$itct; b_u = inits$b_u; beta_u = inits$beta_u
    lambda = inits$lambda; alpha = inits$alpha
    log_alpha = log(max(alpha, 1e-15))
    log_lambda = log(max(lambda, 1e-15))
    pir = rep(1, N)
    llp1_0 <- llp2_0 <-0
    lik_inc <- lik_lat <- c()
    b_p_path <- beta_p_path <- b_u_path <- beta_u_path <- itct_path <- alpha_path<-lambda_path<-NULL
    conv1 <- conv2 <- FALSE

    ######## loop ########

    repeat{

      #### E-step
      if(is.null(X_u)) b_x = itct + X_p[,b_p!=0,drop=FALSE] %*% b_p[b_p!=0]
      else b_x = itct + X_u %*% b_u + X_p[,b_p!=0,drop=FALSE] %*% b_p[b_p!=0]
      if(is.null(W_u)) beta_w = W_p[,beta_p!=0,drop=FALSE] %*% beta_p[beta_p!=0]
      else beta_w = W_u %*% beta_u + W_p[,beta_p!=0,drop=FALSE] %*% beta_p[beta_p!=0]
      pir[delta==0] = 1/(1+exp(-b_x[delta==0]+
                                 (lambda * time[delta==0])^alpha * exp(beta_w[delta==0])))
      #### update penalized parameters
      if (!conv1){
        b_p_ext = optim(par = b_p_ext, fn = l1_negloglik_inc_b, gr = l1_grad_b,
                        itct = itct, b_u = b_u, X_u = X_u, X_p = X_p,
                        pir = pir, mu = mu_inc, method = 'L-BFGS-B', lower = rep(0, 2*J))$par
        b_p = b_p_ext[1:J]-b_p_ext[(J+1):(2*J)]
      }

      if (!conv2){
        beta_p_ext = optim(par = beta_p_ext, fn = weib_negloglik_lat_beta, gr = weib_grad_beta,
                           alpha = alpha, lambda = lambda,
                           beta_u = beta_u, W_u = W_u, W_p = W_p, time = time, delta = delta, pir = pir, mu = mu_lat,
                           method = 'L-BFGS-B', lower = rep(0, 2*M))$par
        beta_p = beta_p_ext[1:M]-beta_p_ext[(M+1):(2*M)]
      }

      #### update nonpenalized parameters
      if (!conv1){
        out_inc = optim(par = c(itct,b_u), fn = l1_negloglik_inc, gr = l1_gradient_inc,
                        b_p = b_p, X_u =X_u, X_p = X_p, pir = pir, mu = mu_inc, method="BFGS") # incidence
        itct = out_inc$par[1]
        if(!is.null(X_u)) b_u = out_inc$par[2:(ncol(X_u)+1)]
        llp1_1 = -out_inc$value
      }
      lik_inc = c(lik_inc, llp1_1)

      if (!conv2){
        out_lat = optim(par = c(log_alpha, log_lambda, beta_u),
                        fn = weib_negloglik_lat, gr = weib_gradient_lat,
                        beta_p=beta_p, W_u=W_u, W_p=W_p, time=time, delta=delta, pir=pir, mu=mu_lat,
                        method="BFGS") # latency
        log_alpha = out_lat$par[1]
        alpha = exp(log_alpha)
        log_lambda = out_lat$par[2]
        lambda = exp(log_lambda)
        if(!is.null(W_u)) beta_u = out_lat$par[3:(ncol(W_u)+2)] # unpenalized incidence
        llp2_1 = -out_lat$value
      }
      lik_lat = c(lik_lat, llp2_1)

      ## record updated parameters
      itct_path = c(itct_path, itct)
      if(!is.null(X_u)) b_u_path = rbind(b_u_path, b_u)
      b_p_path = rbind(b_p_path, b_p)
      if(!is.null(W_u)) beta_u_path = rbind(beta_u_path, beta_u)
      beta_p_path = rbind(beta_p_path, beta_p)
      alpha_path = c(alpha_path, alpha)
      lambda_path = c(lambda_path, lambda)
      if (!conv1 & abs(llp1_1- llp1_0)< tol) conv1 <- TRUE
      if (!conv2 & abs(llp2_1- llp2_0)< tol) conv2 <- TRUE
      if (step > 1 & (conv1 & conv2) | step >= nIter) { #
        break
      }
      llp1_0 <- llp1_1
      llp2_0 <- llp2_1
      step <- 1 + step
    }

    ######## output ########
    output <- list(b_p_path = b_p_path, b_u_path = b_u_path, itct_path = itct_path,
                   beta_p_path = beta_p_path, beta_u_path = beta_u_path,
                   alpha_path = alpha_path, lambda_path = lambda_path,
                   lik_inc = lik_inc, lik_lat = lik_lat)
    output
  }

weib_grad_beta <-
  function(theta, alpha, lambda, beta_u, W_u, W_p, time, delta, pir, mu){
    N = nrow(W_p)
    M = ncol(W_p)
    beta_p_ext = theta
    beta_p = beta_p_ext[1:M]-beta_p_ext[(M+1):(2*M)]
    beta_nonzero = which(beta_p!=0)
    if(!is.null(W_u)) betaw = W_u %*% beta_u + W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    else betaw = W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    temp1 = pir*(lambda*time)^alpha*exp(betaw)
    grad = matrix(delta-temp1,1)%*% W_p
    return(c(-grad+ N*mu, grad+ N*mu))
  }

weib_gradient_lat <-
  function(theta, beta_p, W_u, W_p, time, delta, pir, mu){ # latency
    log_alpha = theta[1]
    log_lambda = theta[2]
    alpha = exp(log_alpha)
    lambda = exp(log_lambda)
    if(!is.null(W_u)) beta_u = theta[3:(ncol(W_u)+2)]
    beta_nonzero = which(beta_p!=0)
    if(!is.null(W_u)) betaw = W_u %*% beta_u + W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    else betaw = W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    temp1 = pir*(lambda*time)^alpha*exp(betaw)
    temp2 = delta-temp1
    grad1 = sum(delta+log(pmax(lambda * time,1e-100))*temp2*alpha)
    grad2 = sum(alpha*temp2)
    if(!is.null(W_u)) grad3 = matrix(temp2,1) %*% W_u else grad3=NULL
    return(-c(grad1, grad2, grad3))
  }

weib.cure.update <-
  function(alpha, lambda, b_p, beta_p, b_u, itct, beta_u, X_u , X_p, W_u, W_p, time, delta, epsilon)
  {
    b_nonzero = which(b_p!=0)
    beta_nonzero = which(beta_p!=0)
    if(!is.null(X_u)) C_b = exp(itct + X_u %*% b_u + X_p[,b_nonzero,drop=FALSE] %*% b_p[b_nonzero])
    else C_b = exp(itct + X_p[,b_nonzero,drop=FALSE] %*% b_p[b_nonzero])
    if(!is.null(W_u)) C_beta = exp(W_u %*% beta_u + W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero])
    else C_beta = exp(W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero])
    C_lbeta = exp(-(lambda * time)^alpha * C_beta)
    temp2 = (lambda * time)^alpha * C_beta
    temp3 = 1 / (1 + C_b*C_lbeta)
    temp4 = (1-delta) * C_b * C_lbeta * temp2 * temp3
    temp5 = delta * (1-temp2)
    temp6 = 1/(1+C_b)
    ### update b_p
    grad_b = matrix(temp6*(delta - (1-delta) * C_b * (1-C_lbeta) * temp3),1) %*% X_p # length J
    j_b = which.max(grad_b)
    b_p[j_b] = b_p[j_b] + epsilon
    ### update beta_p
    grad_beta = matrix(temp5 - temp4,1) %*% W_p # length M
    j_beta = which.max(grad_beta)
    beta_p[j_beta] = beta_p[j_beta] + epsilon

    return(list(b_p = b_p, beta_p = beta_p))
  }

weibull.cure <-
  function(X_u=NULL, X_p, W_u=NULL, W_p, time, delta, epsilon = 0.001, tol = 1e-05,
           nIter=1e4, inits=NULL, verbose)
  {

    # X_u: N by nonp, non-penalized covariate matrix associated with incidence
    # X_p: N by J, penalized covariate matrix associated with incidence
    # W_u: N by nonp, non-penalized covariate matrix associated with latency
    # W_p:  N by M, penalized covariate matrix associated with incidence
    # time: vector of length N, observed survival time
    # delta: vector of length N, censoring status (not censored = 0)
    # epsilon: incremental size
    # tol: difference between log-likelihood

    N = length(time)
    J = ncol(X_p) # number of penalized incidence covariates
    M = ncol(W_p) # number of penalized latency covariates
    X_p = cbind(X_p, -X_p) # N by 2J
    W_p = cbind(W_p, -W_p) # N by 2M

    ######## initialization ########
    step = 1
    b_p = rep(0, 2*J) # penalized incidence
    beta_p = rep(0, 2*M) # penalized latency
    if(is.null(inits)) inits = initialization_parametric(X_u, W_u, time, delta, model="weibull")
    itct = inits$itct; b_u = inits$b_u; beta_u = inits$beta_u
    lambda = inits$lambda; alpha = inits$alpha
    log_alpha = log(max(alpha, 1e-15))
    log_lambda = log(max(lambda, 1e-15))
    LL0 = 0

    b_p_path <- beta_p_path <- alpha_path <- lambda_path <- b_u_path <- beta_u_path <- itct_path <- NULL
    logLikelihood <- numeric()

    ######## loop ########

    repeat{

      #### update penalized parameters
      upd = weib.cure.update(alpha, lambda, b_p, beta_p, b_u, itct, beta_u, X_u , X_p, W_u, W_p, time, delta, epsilon)
      b_p = upd$b_p
      beta_p = upd$beta_p
      b_p_path = rbind(b_p_path, b_p)
      beta_p_path = rbind(beta_p_path, beta_p)

      #### update other parameters
      out = optim(par = c(log_alpha, log_lambda, b_u, itct, beta_u), fn = weib.cure.negloglik, gr = weib.cure.gradient,
                  b_p = b_p, beta_p = beta_p, X_u = X_u, X_p = X_p, W_u = W_u, W_p = W_p,
                  time = time, delta = delta, method="BFGS")
      log_alpha = out$par[1]
      alpha = exp(log_alpha)
      log_lambda = out$par[2]
      lambda = exp(log_lambda)
      if(!is.null(X_u)) {
        b_u = out$par[3:(ncol(X_u)+2)] # unpenalized incidence
        itct = out$par[ncol(X_u)+3]
        if(!is.null(W_u)) beta_u = out$par[(ncol(X_u)+4):(ncol(X_u)+ncol(W_u)+3)] # unpenalized latency
      } else{
        itct = out$par[3]
        if(!is.null(W_u)) beta_u = out$par[4:(ncol(W_u)+3)] # unpenalized latency
      }

      alpha_path = c(alpha_path, alpha)
      lambda_path = c(lambda_path, lambda)
      if(!is.null(X_u)) b_u_path = rbind(b_u_path, b_u)
      itct_path = c(itct_path, itct)
      if(!is.null(W_u)) beta_u_path = rbind(beta_u_path, beta_u)

      LL1 = -out$value
      logLikelihood = c(logLikelihood, LL1)

      if(verbose & step%%1000==0) message("step = ", step, "\n")
      if (step > 1 && (abs(LL1 - LL0) < tol | step >= nIter)) { #
        break
      }
      LL0 <- LL1
      step <- 1 + step
    }

    ######## output ########
    b_p_path = b_p_path[,1:J] - b_p_path[,(J+1):(2*J)]
    beta_p_path = beta_p_path[,1:M] - beta_p_path[,(M+1):(2*M)]
    output <- list(b_p_path = b_p_path, beta_p_path = beta_p_path, alpha_path = alpha_path,
                   lambda_path = lambda_path, b_u_path = b_u_path, itct_path = itct_path,
                   beta_u_path = beta_u_path, logLikelihood = logLikelihood)
    output
  }

weib.cure.gradient <-
  function(theta, b_p, beta_p, X_u , X_p, W_u, W_p, time, delta)
  {
    N = length(time)
    alpha = exp(theta[1])
    lambda = exp(theta[2])
    if(!is.null(X_u)) {
      b_u = theta[3:(ncol(X_u)+2)] # unpenalized incidence
      itct = theta[ncol(X_u)+3]
      if(!is.null(W_u)) beta_u = theta[(ncol(X_u)+4):(ncol(X_u)+ncol(W_u)+3)] # unpenalized latency
    } else{
      itct = theta[3]
      if(!is.null(W_u)) beta_u = theta[4:(ncol(W_u)+3)] # unpenalized latency
    }
    b_nonzero = which(b_p!=0)
    beta_nonzero = which(beta_p!=0)
    if(!is.null(X_u)) C_b = exp(itct + X_u %*% b_u + X_p[,b_nonzero,drop=FALSE] %*% b_p[b_nonzero])
    else C_b = exp(itct + X_p[,b_nonzero,drop=FALSE] %*% b_p[b_nonzero])
    if(!is.null(W_u)) C_beta = exp(W_u %*% beta_u + W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero])
    else C_beta = exp(W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero])
    C_lbeta = exp(-(lambda * time)^alpha * C_beta)
    temp1 = log (pmax(lambda*time,rep(1e-100,N)))
    temp2 = (lambda * time)^alpha * C_beta
    temp3 = 1 / (1 + C_b*C_lbeta)
    temp4 = (1-delta) * C_b * C_lbeta * temp2 * temp3
    temp5 = delta * (1-temp2)
    temp6 = 1/(1+C_b)
    temp7 = temp6*(delta - (1-delta) * C_b * (1-C_lbeta) * temp3)
    grad1 = sum(delta * (1/alpha + temp1 - temp2 * temp1) - temp4 * temp1) * alpha
    grad2 = alpha * sum(temp5 - temp4)
    if(!is.null(X_u)) grad3 = matrix(temp7,1) %*% X_u else grad3=NULL
    grad4 = sum(temp7) # intercept
    if(!is.null(W_u)) grad5 = matrix(temp5 - temp4, 1) %*% W_u else grad5=NULL
    return(-c(grad1, grad2, grad3, grad4, grad5))
  }

weib.cure.negloglik <-
  function(theta, b_p, beta_p, X_u , X_p, W_u, W_p, time, delta)
  {
    N = length(time)
    alpha = exp(theta[1])
    lambda = exp(theta[2])
    if(!is.null(X_u)) {
      b_u = theta[3:(ncol(X_u)+2)] # unpenalized incidence
      itct = theta[ncol(X_u)+3]
      if(!is.null(W_u)) beta_u = theta[(ncol(X_u)+4):(ncol(X_u)+ncol(W_u)+3)] # unpenalized latency
    } else{
      itct = theta[3]
      if(!is.null(W_u)) beta_u = theta[4:(ncol(W_u)+3)] # unpenalized latency
    }
    b_nonzero = which(b_p!=0)
    beta_nonzero = which(beta_p!=0)
    if(!is.null(X_u)) logC_b = itct + X_u %*% b_u + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    else logC_b = itct + X_p[,b_nonzero, drop=FALSE] %*% b_p[b_nonzero]
    if(!is.null(W_u)) logC_beta = W_u %*% beta_u + W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero]
    else logC_beta = W_p[,beta_nonzero,drop=FALSE] %*% beta_p[beta_nonzero]
    temp1 = 1/(1+exp(-logC_b)) # exp(logC_b) / (1+exp(logC_b)) #Mar17
    logC_lbeta = -(lambda * time)^alpha * exp(logC_beta)
    ll1 = delta * (log(temp1) + theta[2] + theta[1] +
                     (alpha-1)*log(pmax(lambda * time,rep(1e-100,N))) +
                     logC_beta + logC_lbeta)
    ll2 = (1-delta) * log(pmax(1 - temp1 * (1-exp(logC_lbeta)),rep(1e-100,N)))
    return(-sum(ll1+ll2))
  }

weib_negloglik_lat_beta <-
  function(theta, alpha, lambda, beta_u, W_u, W_p, time, delta, pir, mu){ # latency
    N = nrow(W_p)
    M = ncol(W_p)
    beta_p_ext = theta
    beta_p = beta_p_ext[1:M]-beta_p_ext[(M+1):(2*M)]
    beta_nonzero = which(beta_p!=0)
    if(!is.null(W_u)) betaw = W_u %*% beta_u + W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    else betaw = W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    ll1 = delta*( log(alpha)+log(lambda) +(alpha-1)*log(pmax(lambda * time,1e-100))+betaw )
    ll2 = -pir*(lambda*time)^alpha*exp(betaw)
    return(-sum(ll1+ll2)+N*mu*sum(beta_p_ext))
  }

weib_negloglik_lat <-
  function(theta, beta_p, W_u, W_p, time, delta, pir, mu){ # latency
    N = nrow(W_p)
    log_alpha = theta[1]
    log_lambda = theta[2]
    alpha = exp(log_alpha)
    lambda = exp(log_lambda)
    if(!is.null(W_u)) beta_u = theta[3:(ncol(W_u)+2)]
    beta_nonzero = which(beta_p!=0)
    if(!is.null(W_u)) betaw = W_u %*% beta_u + W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    else betaw = W_p[,beta_nonzero, drop=FALSE] %*% beta_p[beta_nonzero]
    ll1 = delta*( log_alpha+log_lambda +(alpha-1)*log(pmax(lambda * time,1e-100))+betaw )
    ll2 = -pir*(lambda*time)^alpha*exp(betaw)
    return(-sum(ll1+ll2)+N*mu*sum(abs(beta_p)))
  }

# If B = NULL use exponential; otherwise use uniform with B=upper limit
sim_cure <-
  function(n, mu=1, censor.mu=3, B=NULL, Reps=10000, seed=NULL) {
    susceptible.prop<-numeric(length=Reps)
    if (!is.null(seed)) {
      set.seed(seed)
    }
    for (i in 1:Reps) {
      surv<-rexp(n, 1/mu)
      if (is.null(B)) {
        u<-rexp(n, 1/censor.mu) ## rate is 1/mu
      } else {
        u<-runif(n,0,B)
      }
      time<-ifelse(surv<=u,surv,u)
      censor<-ifelse(surv<=u,1,0)
      fit<-survival::survfit(survival::Surv(time,censor)~1)
      susceptible.prop[i]<-1-min(fit$surv)
    }
    susceptible.prop
  }

AUC_msi <- function(cure_cutoff = 5, b_u_hat=NULL, itct_hat, b_p_hat, testing_delta, testing_time,
                    X_u=NULL, X_p){
  testing_n = length(testing_time)
  v = rep(0, testing_n)
  y = rep(999, testing_n)
  y[testing_time>cure_cutoff] = 0
  y[testing_time<=cure_cutoff & testing_delta==1] = 1
  v[y<2] = 1
  if(all(b_p_hat==0)) {
    if(is.null(X_u)) xb = rep(itct_hat, testing_n) else xb = itct_hat+X_u %*% b_u_hat
  } else{
    if(is.null(X_u)) xb = itct_hat + X_p[,b_p_hat!=0,drop=FALSE] %*% b_p_hat[b_p_hat!=0]
    else xb = itct_hat+X_u %*% b_u_hat + X_p[,b_p_hat!=0,drop=FALSE] %*% b_p_hat[b_p_hat!=0]
  }
  p_hat = 1/(1+exp(-xb))
  temp = v*y + (1-v)*p_hat
  temp1 = temp[order(p_hat, decreasing = T)]
  temp_f = v*(1-y) + (1-v) * (1-p_hat)
  temp1f = temp_f[order(p_hat, decreasing = T)]
  TPR = c(0, cumsum(temp1)/cumsum(temp1)[testing_n])
  FPR = c(0, cumsum(temp1f)/cumsum(temp1f)[testing_n])
  height = (TPR[-1]+TPR[-length(TPR)])/2
  width = diff(FPR)
  auc_msi = sum(height*width)
  return(auc_msi)
}


C.stat <-
function (cure_cutoff = 5, b_u_hat = NULL, itct_hat, b_p_hat,
          beta_u_hat = NULL, beta_p_hat, testing_delta, testing_time,
          X_u = NULL, X_p, W_u = NULL, W_p)
{
  C_csw_num = 0
  C_csw_denom = 0
  testing_n = length(testing_time)
  v = rep(0, testing_n)
  y = rep(999, testing_n)
  y[testing_time > cure_cutoff] = 0
  y[testing_time <= cure_cutoff & testing_delta == 1] = 1
  v[y < 2] = 1
  if (all(b_p_hat == 0)) {
    if (is.null(X_u))
      p_hat = 1/(1 + exp(-itct_hat))
    else p_hat = 1/(1 + exp(-itct_hat - X_u %*% b_u_hat))
  }
  else {
    if (is.null(X_u))
      p_hat = 1/(1 + exp(-itct_hat - X_p[, b_p_hat != 0,
                                         drop = FALSE] %*% b_p_hat[b_p_hat != 0]))
    else p_hat = 1/(1 + exp(-itct_hat - X_u %*% b_u_hat -
                              X_p[, b_p_hat != 0, drop = FALSE] %*% b_p_hat[b_p_hat !=
                                                                              0]))
  }
  temp = v * y + (1 - v) * as.vector(p_hat)
  if (all(beta_p_hat == 0)) {
    if (is.null(W_u))
      W_beta = rep(0, testing_n)
    else W_beta = W_u %*% beta_u_hat
  }
  else {
    if (is.null(W_u))
      W_beta = W_p[, beta_p_hat != 0, drop = FALSE] %*%
        beta_p_hat[beta_p_hat != 0]
    else W_beta = W_u %*% beta_u_hat + W_p[, beta_p_hat !=
                                             0, drop = FALSE] %*% beta_p_hat[beta_p_hat != 0]
  }
  for (i in 1:testing_n) for (j in 1:testing_n) {
    if (j == i | !testing_delta[i] | testing_time[i] > testing_time[j])
      next
    I_ij = testing_time[i] < testing_time[j] | (testing_time[i] ==
                                                  testing_time[j] & !testing_delta[j])
    if (!I_ij)
      next
    if (W_beta[i] > W_beta[j])
      C_csw_num = C_csw_num + temp[j]
    C_csw_denom = C_csw_denom + temp[j]
  }
  return(C_csw_num/C_csw_denom)
}
Any scripts or data that you put into this service are public.
hdcuremodels documentation built on June 22, 2024, 10:17 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
hdcuremodels
Penalized Mixture Cure Models for High-Dimensional Data

R/zzz.R
In hdcuremodels: Penalized Mixture Cure Models for High-Dimensional Data

Try the hdcuremodels package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hdcuremodels Penalized Mixture Cure Models for High-Dimensional Data

R/zzz.R In hdcuremodels: Penalized Mixture Cure Models for High-Dimensional Data

Try the hdcuremodels package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hdcuremodels
Penalized Mixture Cure Models for High-Dimensional Data

R/zzz.R
In hdcuremodels: Penalized Mixture Cure Models for High-Dimensional Data