inst/sim/replication_script_for_all_new.R

# This replication script includes all demonstration codes and simulation codes
# for manuscript `geeVerse: Ultra-high Dimensional Heterogeneous Data Analysis
# with Generalized Estimating Equations` submitted to the Journal of Statistical
# Software.The paper uses 100 replications. The results in the Monte Carlo
# simulation study may be slightly different due to randomness.


# Before you run the script, make sure you have the geeVerse package installed.
# It can be installed from a source package provided or directly from CRAN.
library(geeVerse)
library(tictoc)

tic()
# Section 4.1 -------------------------------------------------------------

## Example 1 Demonstration Script-----------------------------------------
set.seed(2024)
nsub = 100
sim_data = generateData(nsub= nsub, nobs = rep(10, nsub),  p = 200,
       beta0 = c(rep(1,7),rep(0,193)), rho = 0.6, correlation = "exchangeable")
y = sim_data$y
x = sim_data$X

QPGEE_results = qpgee(x, y, nobs=rep(10, nsub), correlation = "exchangeable",
                      tau = 0.9, max_it=100, method = "HBIC")
names(QPGEE_results)
compile_result(QPGEE_results, beta0= c(rep(1,7),rep(0,193)), cutoff = 0.1)

## Monte Carlo Simulation Script for Table 2 -----------------------------
n_sim = 1 # number of simulation replications
res_list = NULL # initialize a list to store all the fitted models
# start simulation
for (m in 1:n_sim){
  #generate data
  nsub = 100
  sim_data = generateData(nsub= nsub, nobs = rep(10, nsub),  p = 200,
        beta0 = c(rep(1,7),rep(0,193)), rho = 0.6, correlation = "exchangeable")
  y = sim_data$y
  x = sim_data$X
  # fit qpgee on different correlation structure for different tau
  correlation_vec = c("independence","exchangeable","AR1")
  tau_vec = c(0.1,0.5,0.9)
  for (correlation in correlation_vec){
    for(tau in tau_vec){
      QPGEE_results = qpgee(x, y, nobs=rep(10, nsub), correlation = correlation,
                            tau = tau, max_it=100, method = "HBIC",ncore = 10)
      res_list[[correlation]][[as.character(tau)]][[m]] = QPGEE_results
    }
  }
}
#Report results for independence working correlation structure with tau = 0.5 from Table 2;
#Note: users can update parameters for other working correlation structures and tau results
compile_result(res_list[["independence"]][[as.character(0.5)]],
               beta0= c(rep(1,7),rep(0,193)))
toc()
tic()
## Example 2 Demonstration Script-----------------------------------------
qpgee(x, y , nobs = rep(1,1000), correlation = "independence", tau = 0.9,
      max_it = 100, method = "HBIC")

## Monte Carlo Simulation  Script for Table 3 -----------------------------
n_sim = 1 # number of simulation replications
res_list = NULL # initialize a list to store all the fitted models
# start simulation
for (m in 1:n_sim){
  #generate data
  nsub = 100
  set.seed(2024)
  sim_data = generateData(nsub= nsub, nobs = rep(10, nsub),  p = 200,
                          beta0 = c(rep(1,7),rep(0,193)), rho = 0.6, correlation = "indepedence")
  y = sim_data$y
  x = sim_data$X
  # fit qpgee on different correlation structure for different tau
  correlation_vec = c("independence")
  tau_vec = c(0.1,0.5,0.9)
  for (correlation in correlation_vec){
    for(tau in tau_vec){
      QPGEE_results = qpgee(x, y, nobs=rep(1, 1000), correlation = correlation,
                            tau = tau, max_it=100, method = "HBIC",ncore = 10)
      res_list[[correlation]][[as.character(tau)]][[m]] = QPGEE_results
    }
  }
}
#Report results for independence working correlation structure with tau = 0.5 from Table 3;
#Note: users can update parameters for other working correlation structures and tau results
compile_result(res_list[["independence"]][[as.character(0.5)]],
               beta0= c(rep(1,7),rep(0,193)))
toc()
tic()
## Example 3 Demonstration Script-----------------------------------------
#generate imbalanced data
nobs = c(rep(10,nsub/2),rep(5,nsub/2))
sim_data = generateData(nsub= 100, nobs = nobs, p = 200,
                        beta0 = c(rep(1,7),rep(0,193)), rho = 0.6, correlation = "exchangeable")
y = sim_data$y
x = sim_data$X
# fit qpgee
qpgee(x, y, nobs = c(rep(10,nsub/2),rep(5,nsub/2)),
      correlation = "exchangeable", tau =0.9, max_it=100, method = "HBIC")

## Monte Carlo Simulation Script for Table 4 ---------------------------------
n_sim = 1 # number of simulation replications
res_list = NULL # initialize a list to store all the fitted models
# start simulation
for (m in 1:n_sim){
  #generate data
  nsub = 100
  set.seed(2024)
  nobs = c(rep(10,nsub/2),rep(5,nsub/2))
  sim_data = generateData(nsub= 100, nobs = nobs, p = 200,
                          beta0 = c(rep(1,7),rep(0,193)), rho = 0.6, correlation = "exchangeable")
  y = sim_data$y
  x = sim_data$X
  # fit qpgee on different correlation structure for different tau
  correlation_vec = c("independence","exchangeable","AR1")
  tau_vec = c(0.1,0.5,0.9)
  for (correlation in correlation_vec){
    for(tau in tau_vec){
      QPGEE_results = qpgee(x, y, nobs=nobs, correlation = correlation,
                            tau = tau, max_it=100, method = "HBIC",ncore = 10)
      res_list[[correlation]][[as.character(tau)]][[m]] = QPGEE_results
    }
  }
}
#Report results for independence working correlation structure with tau = 0.5 from Table 4;
#Note: users can update parameters for other working correlation structures and tau results
compile_result(res_list[["independence"]][[as.character(0.5)]],
               beta0= c(rep(1,7),rep(0,193)))
toc()
tic()
# Section 4.2 -------------------------------------------------------------
# Monte Carlo Simulation Study and Screening Option
# This subsection includes codes for table 5 and table 6.
# Note we use 2 replications for computational efficiency instead of 100 replications
# as in the paper, so our results will likely be different than the 100 replication simulation based tables in the paper.

## 4.2 Demonstration Script------------------------------------------------
nsub = 100
sim_data = generateData(nsub = nsub, nobs = rep(10, nsub), p = 1000,
                        beta0 = c(rep(1,7),rep(0,993)), rho = 0.6, correlation = "exchangeable")
y = sim_data$y
x = sim_data$X

xsis = x[,SIS::SIS(x,y,nsis=200)$sis.ix0]

qpgee(xsis, y, nobs=rep(10, nsub), correlation  = "exchangeable",
      tau=0.9, max_it=100, method = "HBIC", ncore = 10)

## Monte Carlo Simulation Script for Table 5 ----------------------------------
n_sim = 1 # number of simulation replications
res_list = NULL # initialize a list to store all the fitted models
# start simulation
for (m in 1:n_sim){
  #generate data
  nsub = 100
  sim_data = generateData(nsub= nsub, nobs = rep(10, nsub),  p = 200,
                          beta0 = c(rep(1,7),rep(0,193)), rho = 0.6, correlation = "exchangeable")
  y = sim_data$y
  x = sim_data$X
  # fit qpgee on different correlation structure for different tau
  correlation_vec = c("independence","exchangeable","AR1")
  tau_vec = c(0.1,0.5,0.9)
  for (correlation in correlation_vec){
    for(tau in tau_vec){
      QPGEE_results = qpgee(x, y, nobs=rep(10, nsub), correlation = correlation,
                            tau = tau, max_it=100, method = "HBIC",ncore = 10)
      res_list[[correlation]][[as.character(tau)]][[m]] = QPGEE_results
    }
  }
}
#Report results for independence working correlation structure with tau = 0.5 from Table 5;
#Note: users can update parameters for other working correlation structures
compile_result(res_list[["independence"]][[as.character(0.5)]],
               beta0= c(rep(1,7),rep(0,193)))

## Demostration Script for example 4 ---------------------------
set.seed(2024)
sim_data = generateData(nsub = nsub, nobs = rep(10, nsub), p = 1000,
                        beta0 = c(rep(1,7),rep(0,993)), rho = 0.6, correlation = "exchangeable")
y = sim_data$y
x = sim_data$X
x_sis_ind = SIS::SIS(x,y,nsis=200)$sis.ix0
xsis = x[,x_sis_ind]
QPGEE_results_highdim = qpgee(xsis, y, nobs=rep(10, nsub),
                              correlation = "exchangeable", tau=0.9, max_it=100, method = "HBIC", ncore = 10)

QPGEE_results_highdim$beta <- replace(vector("numeric", 1000), x_sis_ind, QPGEE_results$beta)
compile_result(QPGEE_results_highdim,beta0= c(rep(1,7),rep(0,993)))

## Monte Carlo Simulation Script for Table 6 ---------------------------
n_sim = 1 # number of simulation replications
res_list = NULL # initialize a list to store all the fitted models
# start simulation
for (m in 1:n_sim){
  #generate data
  nsub = 100
  set.seed(2024)
  sim_data = generateData(nsub = nsub, nobs = rep(10, nsub), p = 1000,
                          beta0 = c(rep(1,7),rep(0,993)), rho = 0.6, correlation = "exchangeable")
  y = sim_data$y
  x = sim_data$X
  # fit qpgee on different correlation structure for different tau
  correlation_vec = c("independence","exchangeable","AR1")
  tau_vec = c(0.1,0.5,0.9)
  for (correlation in correlation_vec){
    for(tau in tau_vec){
      x_sis_ind = SIS::SIS(x,y,nsis=200)$sis.ix0
      xsis = x[,x_sis_ind]
      QPGEE_results = qpgee(xsis, y, nobs=rep(10, nsub), correlation = correlation,
                            tau = tau, max_it=100, method = "HBIC",ncore = 10)
      full_beta = rep(0,1000)
      full_beta[x_sis_ind] = QPGEE_results$beta
      QPGEE_results$beta = full_beta
      res_list[[correlation]][[as.character(tau)]][[m]] = QPGEE_results
    }
  }
}
#Report results for independence working correlation structure with tau = 0.5 from Table 6;
#Note: users can update parameters for other working correlation structures
compile_result(res_list[["exchangeable"]][[as.character(0.9)]],
               beta0= c(rep(1,7),rep(0,993)))
toc()
tic()
# Section 4.3 -------------------------------------------------------------
## Computational Time Comparison Script, Table 7 --------------------------

# Note that we only showcase p = 200 here.
# For p = 2000, just change p from 200 to 2000.
p = 200
# Initialize some vectors to store computation time
# No palatalization is used for all three methods tested
qpgee_tvec = c()
pgee_own_tvec = c()
pgee_pkg_tvec = c()

# start simulation
n_sim = 1
for (i in 1:n_sim) {
  #generate data
  nsub = 100
  nobs = rep(10, nsub)
  id = rep(1:nsub,each = nobs)
  sim_data = generateData(nsub= nsub, nobs = rep(10, nsub),  p = p,
                          beta0 = c(rep(1,7),rep(0,193)), rho = 0.6, correlation = "exchangeable")
  y = sim_data$y
  x = sim_data$X

  #fit qpgee
  qpgee_time = system.time(qpgee_fit <-
                             qpgee(x,y,lambda = 0.1, tau=0.5,nobs=nobs,cutoff = 10^-3))[3]
  qpgee_tvec = c(qpgee_tvec,qpgee_time)

  #fit own pgee
  data = data.frame(x,y,id)
  pgee_own_time = system.time(PGEE_own_fit <- PGEE("y ~.-id-1",id = id,data = data,
                                                   corstr = "exchangeable",lambda=0.1))[3]
  pgee_own_tvec = c(pgee_own_tvec,pgee_own_time)

  #fit pgee from PGEE package
  pgee_pkg_time = system.time(PGEE_pkg <- PGEE::PGEE("y ~.-id-1",id = id,data = data,
                                                     corstr = "exchangeable",lambda=0.1))[3]
  pgee_pkg_tvec = c(pgee_pkg_tvec,pgee_pkg_time)

}
#a tool function to report performance
report_sum <- function(x){
  c(mean(x),min(x),max(x),median(x))
}
#report results for table 7
report_sum(qpgee_tvec)
report_sum(pgee_own_tvec)
report_sum(pgee_pkg_tvec)
toc()
tic()
# Section 5.1 -------------------------------------------------------------
## Monte Carlo Simulation Script for Table 8 ---------------------------

# load data
data("simuGene")
set.seed(2024)
sim_data <- generateData(nsub = 1000, nobs = rep(5,1000), p = 50,
                         beta0 = c(rep(1,9),rep(0,41)), rho = 0.6, correlation = "exchangeable",
                         ka = 0.5, SNPs = simuGene[,1:25])
y = sim_data$y
x = sim_data$X

PQGEE_results_median <- qpgee(x, y, nobs=rep(5,1000), correlation="exchangeable",
                              tau=0.5, intercept= FALSE, method="HBIC", cutoff=10^-4,ncore = 10)
compile_result(PQGEE_results_median,
               beta0= c(rep(1,9),rep(0,41)))


# This subsection includes codes for Table 8, p = 50
n_sim = 1 # number of simulation replications
res_list = NULL # initialize a list to store all the fitted models
# start simulation
for (m in 1:n_sim){
  #generate data
  nsub = 1000
  nobs = rep(5,1000)
  p = 50
  sim_data <- generateData(nsub = nsub, nobs = nobs, p = p,
                           beta0 = c(rep(1,9),rep(0,41)), rho = 0.6, correlation = "exchangeable",
                           ka = 0.5, SNPs = simuGene[,1:25])
  y = sim_data$y
  x = sim_data$X
  #for PGEE, reorganize the data
  id = rep(1:nsub, times = nobs)
  data = data.frame(x,y,id)
  # fit qpgee on different correlation structure for different tau
  correlation_vec = c("independence","exchangeable","AR1")
  #correlation_vec = c("exchangeable","AR1")
  for (correlation in correlation_vec){
    # qpgee tau = 0.5
    tau = 0.5
    QPGEE_results = qpgee(x, y, nobs=nobs, correlation = correlation,
                          tau = tau, max_it=100, method = "HBIC",ncore = 18)
    res_list[[correlation]][["qpgee"]][[m]] = QPGEE_results
  }
}
#Report results for independence working correlation structure with tau = 0.5 from Table 8;
#Note: users can update parameters for other working correlation structures
compile_result(res_list[["exchangeable"]][["qpgee"]],
               beta0= c(rep(1,9),rep(0,41)))
toc()
tic()
# This subsection includes codes for Table 8, p = 1000
n_sim = 1 # number of simulation replications
res_list = NULL # initialize a list to store all the fitted models
# start simulation
for (m in 1:n_sim){
  #generate data
  nsub = 1000
  nobs = rep(5,1000)
  p = 1000
  sim_data <- generateData(nsub = nsub, nobs = nobs, p = p,
                           beta0 = c(rep(1,9),rep(0,991)), rho = 0.6, correlation = "exchangeable",
                           ka = 0.5, SNPs = simuGene)
  y = sim_data$y
  x = sim_data$X

  #SIS
  x_sis_ind = SIS::SIS(x,y,nsis=200)$sis.ix0
  xsis = x[,x_sis_ind]

  #for PGEE, reorganize the data
  id = rep(1:nsub, times = nobs)
  data = data.frame(xsis,y,id)
  # fit qpgee on different correlation structure for different tau
  correlation_vec = c("independence","exchangeable","AR1")
  for (correlation in correlation_vec){
    # fit qpgee with tau = 0.5
    tau = 0.5

    QPGEE_results = qpgee(xsis, y, nobs=nobs, correlation = correlation,
                          tau = tau, max_it=100, method = "HBIC",ncore = 10)
    # map screened beta back to full length
    full_beta = rep(0,p)
    full_beta[x_sis_ind] = QPGEE_results$beta
    QPGEE_results$beta = full_beta
    res_list[[correlation]][["qpgee"]][[m]] = QPGEE_results
  }
}
#Report results for independence working correlation structure with tau = 0.5 from Table 8;
#Note: users can update parameters for other working correlation structures
compile_result(res_list[["exchangeable"]][["qpgee"]],
               beta0= c(rep(1,9),rep(0,991)))

toc()
tic()
# Section 5.2 -------------------------------------------------------------

## Gene Expression Example: Yeast Cell G1 ---------------------------------
# Load data, the yeastG1 data is provided in geeVerse package
data("yeastG1")
n_obs=4
n_sub=283
x=as.matrix (yeastG1[,c(3:99)])
y=as.matrix(yeastG1$y)
predictors=c("intercept",names(yeastG1[,c(3:99)]))

# obtain beta initial from a cross-sectional non-penalized quantile model.
library(quantreg)
betaint = coefficients(rq( y ~ x, tau = 0.9))
# qpgee with HBIC tuning
PQGEE_results <- qpgee(x, y, nobs = rep(n_obs,n_sub), betaint = betaint,
                       correlation= "unstructured", tau=0.9, intercept=TRUE, method="HBIC", cutoff=10^-4)
# report selected variables
predictors[abs(PQGEE_results$beta)>10^-3]

# PGEE with CV selected penalty level
CVfit_result <- CVfit("y ~ . -id ", id = id, data = yeastG1,
                      lambda.vec = exp(seq(log(10),log(0.1),length.out = 30)), fold = 5)
myfit1 <- PGEE("y ~ . - id ", id = id, data = yeastG1,
               lambda = CVfit_result$lam.opt)
# report selected variables
pgee_index <- which(abs(myfit1$coefficient) > 10^-3)
predictors[pgee_index]

toc()
# Section 5.3 -------------------------------------------------------------
# As CCI-779 data is not publicly available, we are not able to provide the script here.

Try the geeVerse package in your browser

Any scripts or data that you put into this service are public.

geeVerse documentation built on Aug. 21, 2025, 5:56 p.m.