# ----------
# Minor Functions
# ----------
setdirs = function(work_dir){
# Make directories
verbose = TRUE
if( verbose ) message(sprintf("%s: Make directories ...\n",date()),appendLF = FALSE)
proj_dir = file.path(work_dir,"sim_ROKET"); smart_mkdir(proj_dir)
sim_dir = file.path(proj_dir,"sim"); smart_mkdir(sim_dir)
reps_dir = file.path(sim_dir,"REPS"); smart_mkdir(reps_dir)
rout_dir = file.path(sim_dir,"REG"); smart_mkdir(rout_dir)
sout_dir = file.path(sim_dir,"OUT"); smart_mkdir(sout_dir)
# Output
if( verbose ) message(sprintf("%s: Output ...\n",date()),appendLF = FALSE)
list(proj_dir = proj_dir,sim_dir = sim_dir,reps_dir = reps_dir,
rout_dir = rout_dir,sout_dir = sout_dir)
}
# ----------
# Optimal transport simulation
# ----------
OT_sim = function(){
message("Add simulation code eventually",appendLF = FALSE)
}
# ----------
# Simulation functions
# ----------
kOT_sim_geneInfo = function(nGENE,nPATH,path_corr,prop_noise){
# Group genes into pathways
# Code if genes are exclusive to certain paths
all_PATHS = seq(nPATH)
PATH_prob = exp(seq(nPATH,1)*0.1)
PATH_prob = PATH_prob / sum(PATH_prob)
gene_names = paste0("G",seq(nGENE))
gdat = smart_df(GENE = gene_names,
PATH = sample(all_PATHS,nGENE,replace = TRUE,prob = PATH_prob))
dim(gdat); gdat[1:5,]
table(gdat$PATH)
# Simulate gene similarities based on underlying pathways (range between 0 and 1)
gene_sim = matrix(NA,nGENE,nGENE)
gene_sim = smart_names(gene_sim,ROW = gene_names,COL = gene_names)
# diag(gene_sim) = 1
for(pp1 in seq(nPATH)){
for(pp2 in seq(nPATH)){
# pp1 = 1; pp2 = 2
# Only fill in the upper triangle
PATH1 = all_PATHS[pp1]; PATH1
PATH2 = all_PATHS[pp2]; PATH2
if( PATH1 > PATH2 ) next
idx1 = which(gdat$PATH == PATH1)
idx2 = which(gdat$PATH == PATH2)
ngenes1 = length(idx1); ngenes1
ngenes2 = length(idx2); ngenes2
if( min(c(PATH1,PATH2)) > 0 ){
if( PATH1 == PATH2 ){
sim_range = sort(runif(2,path_corr,1))
gene_sim[idx1,idx2] = runif(ngenes1*ngenes2,sim_range[1],sim_range[2])
} else {
# Pairs of genes not in same pathway
sim_range = sort(runif(2,0,1-path_corr))
gene_sim[idx1,idx2] = runif(ngenes1*ngenes2,sim_range[1],sim_range[2])
gene_sim[idx2,idx1] = runif(ngenes1*ngenes2,sim_range[1],sim_range[2])
}
}
rm(idx1,idx2)
}}
diag(gene_sim) = 1
gene_sim[lower.tri(gene_sim)] = t(gene_sim)[lower.tri(gene_sim)]
# any(is.na(gene_sim))
# Add some noise
gs_noise = matrix(runif(nGENE^2,0,1),nGENE,nGENE)
gs_noise[lower.tri(gs_noise)] = t(gs_noise)[lower.tri(gs_noise)]
diag(gs_noise) = 1
# prop_noise = 0.35
gene_sim2 = (1 - prop_noise) * gene_sim + prop_noise * gs_noise
return(list(gene_sim = gene_sim2,gdat = gdat))
}
kOT_sim_baseCov = function(subj_names){
verbose = TRUE
if( verbose ) message(sprintf("%s: Simulate baseline covariates ...\n",date()),appendLF = FALSE)
NN = length(subj_names)
XX = matrix(data = 1,nrow = NN,
ncol = 1,dimnames = list(seq(NN),"Int"))
XX = cbind(XX,make_dummy(x = sample(c("M","F"),NN,replace = TRUE)))
XX = cbind(XX,make_dummy(x = sample(c("I","II","III","IV"),NN,replace = TRUE)))
XX = cbind(XX,age = scale(sample(seq(20,80),NN,replace = TRUE))[,1])
rownames(XX) = subj_names
XX = as.matrix(XX)
return(XX)
}
kOT_sim_sPATH = function(subj_names,path_names,SCEN){
verbose = TRUE
if( verbose ) message(sprintf("%s: Simulate individual pathways mutated ...\n",date()),appendLF = FALSE)
NN = length(subj_names)
nPATH = length(path_names)
vec_path = seq(nPATH)
sPATH = matrix(NA,NN,nPATH,dimnames = list(subj_names,path_names))
if( SCEN %in% c(1,2) ){
# Approx constant PB mass
PB = sample(seq(9,11),NN,replace = TRUE)
} else if( SCEN %in% c(3,4) ){
# Variable PB mass
PB = sample(seq(5,20),NN,replace = TRUE)
}
PB[PB >= nPATH] = nPATH
PB[PB <= 5] = 5
for(ii in seq(NN)){
# ii = 1
subj_path_set = c(rep(0,nPATH - PB[ii]),rep(1,PB[ii]))
# Shuffle paths
sPATH[ii,] = sample(subj_path_set)
}
dim(sPATH); sPATH[1:5,]
return(sPATH)
}
kOT_sim_sMUT = function(sPATH,geneInfo,SCEN){
verbose = TRUE
if( verbose ) message(sprintf("%s: Simulate individual genes mutated ...\n",date()),appendLF = FALSE)
NN = nrow(sPATH)
gene_names = rownames(geneInfo$gene_sim)
nGENE = length(gene_names)
sMUT = matrix(0,nGENE,NN)
sMUT = smart_names(MAT = sMUT,
ROW = gene_names,
COL = rownames(sPATH))
for(ii in seq(NN)){
# ii = 1
# Determine all pathways mutated and sample from those genes
# sPATH[ii,]
subj_paths = which(sPATH[ii,] == 1)
subj_paths = as.integer(gsub("P","",names(subj_paths)))
subj_paths
tmp_df = smart_df(PATH = subj_paths)
prob_paths = rep(1,length(subj_paths))
prob_paths = prob_paths / sum(prob_paths)
tmp_df$PROB = prob_paths
# Sample from individual's mutated and null paths
if( SCEN %in% c(1,3) ){
# TMB is about 2 or 3 times PMB
subj_TMB = round(length(subj_paths) * 3)
} else if( SCEN %in% c(2,4) ){
# TMB is varied multiple of PMB
subj_TMB = sample(seq(length(subj_paths) + 5,8e1,2),1)
}
samp_paths = sample(subj_paths,subj_TMB,
replace = TRUE,prob = prob_paths)
subj_tab = table(samp_paths); subj_tab
nsubj_paths = length(subj_tab)
tmp_df$CNT = sapply(tmp_df$PATH,function(xx)
sum(samp_paths == xx))
if( min(tmp_df$CNT) == 0 ){
tmp_df$CNT[tmp_df$CNT == 0] = 1
}
tmp_df
subj_genes = c()
for(jj in seq(nrow(tmp_df))){
# jj = 1
tmp_genes = geneInfo$gdat$GENE[which(geneInfo$gdat$PATH == tmp_df$PATH[jj])]
subj_genes = c(subj_genes,sample(tmp_genes,tmp_df$CNT[jj],replace = TRUE))
rm(tmp_genes)
}
subj_genes = unique(subj_genes)
sMUT[subj_genes,ii] = 1
rm(tmp_df,subj_paths,subj_genes,samp_paths,subj_tab)
}
return(sMUT)
}
kOT_sim_GEN = function(geneInfo,NN,SCEN){
nGENE = nrow(geneInfo$gene_sim)
PATHs = sort(unique(geneInfo$gdat$PATH[geneInfo$gdat$PATH > 0])); PATHs
subj_names = paste0("S",seq(NN))
gene_names = paste0("G",seq(nGENE))
path_names = paste0("P",PATHs)
# Simulate pathways mutated
sPATH = kOT_sim_sPATH(subj_names = subj_names,
path_names = path_names,SCEN = SCEN)
# Simulate genes mutated
sMUT = kOT_sim_sMUT(sPATH = sPATH,
geneInfo = geneInfo,SCEN = SCEN)
# Output
return(list(sMUT = sMUT,sPATH = sPATH))
}
kOT_sim_pXX = function(sPATH,pBETA,maxWAY){
verbose = TRUE
if( verbose ) message(sprintf("%s: Construct pathway design matrix ...\n",date()),appendLF = FALSE)
nPATH = ncol(sPATH)
pXX = c()
aa = 1
for(ii in seq(maxWAY)){
# ii = 2
path_subsets = combn(x = seq(nPATH),m = ii)
bb = aa + ncol(path_subsets) - 1; aa; bb
idx_zero = which(pBETA[seq(aa,bb)] != 0)
path_subsets = path_subsets[,idx_zero,drop = FALSE]
# dim(path_subsets)
# length(idx_zero)
for(jj in seq(ncol(path_subsets))){
# jj = 1
path_group = path_subsets[,jj]; path_group
tmp_mat = sPATH[,path_group,drop = FALSE]
pXX = cbind(pXX,apply(tmp_mat,1,prod))
colnames(pXX)[ncol(pXX)] = paste(colnames(tmp_mat),
collapse = ".")
}
aa = bb + 1
# message(dim(pXX),appendLF = FALSE); pXX[1:4,]
}
dim(pXX); pXX[1:3,]
return(pXX)
}
kOT_sim_hZZ = function(sPATH,pXX,TYPE,pBETA){
NN = nrow(sPATH)
nPATH = ncol(sPATH)
path_names = colnames(sPATH)
if( TYPE == "CAT" ){
hZZ = c(pXX %*% pBETA[colnames(pXX)])
} else {
stop("No code yet")
}
return(hZZ)
}
kOT_sim_inputs = function(NN,nGENE,nPATH,path_corr,
prop_noise,maxWAY,pBETA_sig){
# Make COST
geneInfo = kOT_sim_geneInfo(nGENE = nGENE,
nPATH = nPATH,path_corr = path_corr,
prop_noise = prop_noise)
# Sim pBETA
pBETA = c()
fin_names = c()
prop_nz = 0.5; # prop 25 pathways with non-zero 1-way effect
prop_nz_inc = 0.5 # % of n-way effects are non-zero
for(ii in seq(maxWAY)){ # maximum-way interactions
# num_coef = choose(n = nPATH,k = ii)
tmp_combs = combn(x = nPATH,m = ii)
tmp_names = apply(tmp_combs,2,function(xx)
paste(paste0("P",xx),collapse = "."))
num_coef = ncol(tmp_combs)
# Ensure prop_nz of 1-way, 2-way, etc interactions have non-zero effects
tmp_pBETA = rnorm(n = num_coef,mean = 0,sd = pBETA_sig)
binBETA = sample(c(0,1),num_coef,
prob = c(1 - prop_nz,prop_nz),
replace = TRUE)
tmp_pBETA = tmp_pBETA * binBETA
pBETA = c(pBETA,tmp_pBETA)
fin_names = c(fin_names,tmp_names)
prop_nz = prop_nz * prop_nz_inc
}
pBETA = round(pBETA,3)
names(pBETA) = fin_names
# Simulate baseline covariates (gender, tumor stage, age)
subj_names = paste0("S",seq(NN))
XX = kOT_sim_baseCov(subj_names = subj_names)
xBETA = c(-5,0.5,seq(3)/3,-1)
names(xBETA) = colnames(XX)
# Output
inputs = list(nGENE = nGENE,nPATH = nPATH,
path_corr = path_corr,prop_noise = prop_noise,
geneInfo = geneInfo,pBETA = pBETA,XX = XX,
xBETA = xBETA)
return(inputs)
}
#' @title kOT_sim_make
#' @description Generates simulation files
#' @param work_dir A full path to create "sim_ROKET" and subdirectories
#' @param NN A positive integer for sample size
#' @param nGENE A positive integer for number of genes to simulate
#' @param nPATH A positive integer for number of pathways to simulate
#' @param RR A positive integer for number of replicates to simulate
#' @return Nothing. Rds files are created within the simulation ROKET directory.
#' @export
kOT_sim_make = function(work_dir,NN = 200,
nGENE = 500,nPATH = 25,RR = 200){
my_dirs = setdirs(work_dir = work_dir)
if( NN != round(NN) && NN > 0 )
stop("NN isn't a positive integer")
if( nGENE != round(nGENE) && nGENE > 0 )
stop("nGENE isn't a positive integer")
if( nPATH != round(nPATH) && nPATH > 0 )
stop("nPATH isn't a positive integer")
if( RR != round(RR) && RR > 0 )
stop("RR isn't a positive integer")
# Simulation parameters
maxWAY = 4 # maximum number of n-way interactions
path_corr = 0.50 # lower bound on correlation among genes of same pathway
prop_noise = 0.50 # adds static noise to correlation between genes
pBETA_sig = 0.40 # increase variance on pathway coefficients
# Gene relationships, covariates
inputs_fn = file.path(my_dirs$sim_dir,
sprintf("inputs_nS.%s_nG.%s_nP.%s.rds",
NN,nGENE,nPATH))
if( !file.exists(inputs_fn) ){
inputs = kOT_sim_inputs(NN = NN,nGENE = nGENE,
nPATH = nPATH,path_corr = path_corr,
prop_noise = prop_noise,maxWAY = maxWAY,
pBETA_sig = pBETA_sig)
saveRDS(inputs,inputs_fn)
}
inputs = readRDS(inputs_fn)
# Get sim results for a SCENARIO and original replicate
hBETAs = seq(0,1,0.1)
prob_events = c(0.25,0.5,0.75,1)
for(SCEN in seq(4)){
# SCEN = 1
message(sprintf("%s: SCEN = %s ...\n",date(),SCEN),appendLF = FALSE)
# Import scenario data
scen_fn = file.path(my_dirs$reps_dir,
sprintf("sim_SCEN%s.rds",SCEN))
if( !file.exists(scen_fn) ){
scen = kOT_sim_GEN(geneInfo = inputs$geneInfo,
NN = NN,SCEN = SCEN)
names(scen)
saveRDS(scen,scen_fn)
}
scen = readRDS(scen_fn)
pXX = kOT_sim_pXX(sPATH = scen$sPATH,
pBETA = inputs$pBETA,maxWAY = maxWAY)
for(rr in seq(RR)){
smart_progress(ii = rr,nn = RR,iter = 5,iter2 = 1e2)
out_fn = file.path(my_dirs$reps_dir,
sprintf("sim_SCEN%s_rr%s.rds",SCEN,rr))
if( file.exists(out_fn) ) next
OUT = list()
for(hBETA in hBETAs){
# hBETA = hBETAs[1]
# Simulate h(ZZ)
if( hBETA == 0 ){
hZZ = rep(0,NN)
} else {
hZZ = kOT_sim_hZZ(sPATH = scen$sPATH,pXX = pXX,
TYPE = "CAT",pBETA = inputs$pBETA * hBETA)
}
# Expected mean
regMU = c(inputs$XX %*% inputs$xBETA + hZZ )
# Linear
YY = regMU + rnorm(n = NN,0,sd = 1.5)
# Survival
RHO = 2.0
UU = runif(n = NN,0,1)
TT = ( -log(UU) / exp(regMU) )^(1/RHO)
SD = smart_df(TT = TT)
for(prob_event in prob_events){
if( prob_event < 1 ){
max_delta = max(TT); max_delta
while(TRUE){
# max_delta
CC = runif(n = NN,0,max_delta)
prop_delta = mean(TT <= CC); # message(prop_delta,appendLF = FALSE)
prop_diff = prop_delta - prob_event; # prop_diff
if( abs(prop_diff) < 0.02 ){
break
} else if( prop_diff > 0 ){
max_delta = max_delta * 0.9
} else if( prop_diff < 0 ){
max_delta = max_delta * 1.1
}
}
} else {
CC = rep(max(TT) + 1,NN)
}
tmp_SD = smart_df(SS = pmin(TT,CC),DD = 1*(TT <= CC))
prob_event2 = round(prob_event * 100)
names(tmp_SD) = paste0(names(tmp_SD),"_",prob_event2)
SD = cbind(SD,tmp_SD); rm(tmp_SD)
}
tmp_name = sprintf("hBETA_%s",hBETA)
OUT[[tmp_name]] = smart_df(hZZ = hZZ,YY = YY,SD)
rm(hZZ,regMU,YY,SD,UU,TT,CC)
}
saveRDS(OUT,out_fn)
rm(OUT)
}
rm(scen)
}
return(NULL)
}
# ----------
# Simulation Analysis functions
# ----------
#' @title kOT_sim_OT
#' @param work_dir A full path to create "sim_ROKET" and subdirectories
#' @param NN A positive integer for sample size
#' @param nGENE A positive integer for number of genes to simulate
#' @param nPATH A positive integer for number of pathways to simulate
#' @param SCEN An integer taking values 1, 2, 3, or 4
#' @param ncores A positive integer specifying the number of
#' cores/threads to use for optimal transport calculations
#' @return Nothing. Rds files are created within the simulation ROKET directory.
#' @export
kOT_sim_OT = function(work_dir,NN,nGENE,nPATH,SCEN,ncores = 1){
my_dirs = setdirs(work_dir = work_dir)
# Import geneInfo
inputs_fn = file.path(my_dirs$sim_dir,
sprintf("inputs_nS.%s_nG.%s_nP.%s.rds",
NN,nGENE,nPATH))
inputs = readRDS(inputs_fn)
# Import scenario's genomics
scen_fn = file.path(my_dirs$reps_dir,
sprintf("sim_SCEN%s.rds",SCEN))
scen = readRDS(scen_fn)
# OT params
EPS = 1e-3
LAMs = c(0.5,1,5,Inf)
conv = 1e-5
max_iter = 3e3
show_iter = 10
# Run NN by NN OT
OT_fn = file.path(my_dirs$reps_dir,
sprintf("OT_SCEN%s.rds",SCEN))
if( !file.exists(OT_fn) ){
OT = list()
for(LAM in LAMs){
# LAM = LAMs[1]
message(sprintf("%s: LAM = %s ...\n",date(),LAM),appendLF = FALSE)
tmp_name = sprintf("LAM.%s",LAM)
tmp_OT_fn = file.path(my_dirs$reps_dir,
sprintf("tmp_OT_SCEN%s_%s.rds",SCEN,tmp_name))
if( !file.exists(tmp_OT_fn) ){
COST = 1 - inputs$geneInfo$gene_sim
sMUT = scen$sMUT; sMUT = sMUT[rowSums(sMUT) >= 1,]
int_genes = intersect(rownames(COST),rownames(sMUT))
COST = COST[int_genes,int_genes]
sMUT = sMUT[int_genes,]
dim(COST); dim(sMUT)
LAM2 = LAM
BAL = FALSE
if( is.infinite(LAM) ){
LAM2 = 1
BAL = TRUE
}
out = run_myOTs(ZZ = sMUT,COST = COST,EPS = EPS,
LAMBDA1 = LAM2,LAMBDA2 = LAM2,balance = BAL,
conv = conv,max_iter = max_iter,ncores = ncores,
verbose = FALSE,show_iter = show_iter)
saveRDS(out,tmp_OT_fn)
} else {
out = readRDS(tmp_OT_fn)
}
OT[[tmp_name]] = out
}
saveRDS(OT,OT_fn)
}
tmp_fns = list.files(my_dirs$reps_dir)
tmp_fns = tmp_fns[grepl(sprintf("tmp_OT_SCEN%s_",SCEN),tmp_fns)]
if( length(tmp_fns) > 0 )
unlink(file.path(my_dirs$reps_dir,tmp_fns))
return(NULL)
}
kOT_sim_GENE = function(sim,out = "OLS",hBETAs = NULL,nPERM,samp_thres){
num_basecov = ncol(sim$XX); num_basecov
vec_genes = rownames(sim$sMUT)
num_genes = nrow(sim$sMUT); num_genes
if( is.null(hBETAs) ){
hBETAs = names(sim$OUT); hBETAs
hBETAs = hBETAs[as.numeric(gsub("hBETA_","",hBETAs)) >= 0]; hBETAs
}
OUTs = out
if( out == "SURV" ){
OUTs = names(sim$OUT$hBETA_0)
OUTs = OUTs[grepl("SS_",OUTs)]
OUTs = gsub("SS_","",OUTs)
OUTs
}
# GMs = c(0,5,10,20,30)
log10_TMB = log10(colSums(sim$sMUT))
XX = smart_df(cbind(sim$XX[,-1],log10_TMB))
NN = nrow(XX)
res = c()
nom_PVAL = matrix(NA,num_genes,length(OUTs) *length(hBETAs))
rownames(nom_PVAL) = vec_genes
colnames(nom_PVAL) = sprintf("C%s",seq(ncol(nom_PVAL)))
iter = 1
for(OUT in OUTs){
for(hBETA in hBETAs){
# OUT = OUTs[1]; hBETA = hBETAs[1]
message(sprintf("%s: OUT = %s; hBETA = %s ...\n",date(),OUT,hBETA),appendLF = FALSE)
colnames(nom_PVAL)[iter] = sprintf("%s_%s",OUT,hBETA)
if( out == "SURV" ){
tmp_surv = sim$OUT[[hBETA]]
SS = tmp_surv[,paste0("SS_",OUT)]
DD = tmp_surv[,paste0("DD_",OUT)]
}
aPVAL = matrix(NA,nPERM + 1,num_genes)
colnames(aPVAL) = vec_genes
idx_SUBJ = matrix(NA,nPERM + 1,NN)
idx_SUBJ[1,] = seq(NN)
idx_SUBJ[-1,] = t(sapply(seq(nPERM),function(xx) sample(NN)))
cnt = 1
for(gene in vec_genes){
# gene = vec_genes[79]; gene
smart_progress(ii = cnt,nn = num_genes,iter = 5,iter2 = 1e2)
tab1 = table(sim$sMUT[gene,]); tab1
if( length(tab1) != 2 || min(tab1) < samp_thres ){
aPVAL[,gene] = 1
cnt = cnt + 1
next
}
for(PERM in seq(nrow(aPVAL))){
# PERM = 1
if( out == "OLS" ){
lm_out = lm(sim$OUT[[hBETA]]$YY ~
sim$sMUT[gene,idx_SUBJ[PERM,]] + .,data = XX)
aPVAL[PERM,gene] = summary(lm_out)$coefficients[2,4]
}
if( out == "SURV" ){
cx_out = tryCatch(coxph(Surv(SS,DD) ~
sim$sMUT[gene,idx_SUBJ[PERM,]] + .,data = XX),
warning = function(ww){NULL},error = function(ee){NULL})
if( is.null(cx_out) ){
aPVAL[PERM,gene] = 1
next
}
aPVAL[PERM,gene] = summary(cx_out)$coefficients[1,5]
}
}
cnt = cnt + 1
}
keep_genes = colnames(aPVAL)[aPVAL[1,] != 1]; length(keep_genes)
if( length(keep_genes) == 0 ){
PVAL_perm = 1
} else {
vec_mPVAL = apply(aPVAL[,keep_genes,drop = FALSE],1,min)
PVAL_perm = mean(vec_mPVAL <= vec_mPVAL[1]); PVAL_perm
}
hBETA_2 = as.numeric(gsub("hBETA_","",hBETA)); hBETA_2
OUT2 = OUT
if( out == "SURV" ) OUT2 = paste0("SURV_nCENS.",OUT2)
OUT2
res = rbind(res,smart_df(OUT = OUT2,hBETA = hBETA_2,
PVAL_perm = PVAL_perm,nTEST_genes = length(keep_genes)))
nom_PVAL[,iter] = aPVAL[1,]
iter = iter + 1
}}
message(res,appendLF = FALSE)
return(list(RES = res,nom_PVAL = nom_PVAL))
}
kOT_sim_KERN = function(sim,OT,nPERM,hBETAs = NULL){
OT_EUC = OT
EUC = as.matrix(dist(t(sim$sMUT)))
all_out = c("OLS","SURV")
all_LABS = c("EUC","OT")
if( is.null(hBETAs) ){
hBETAs = names(sim$OUT); hBETAs
}
# Set null model covariates
log10_TMB = log10(colSums(sim$sMUT))
XX = cbind(sim$XX[,-1],log10_TMB)
NN = nrow(XX)
KK = array(data = NA,dim = c(NN,NN,length(OT_EUC) + 1))
samp_names = sprintf("S%s",seq(NN))
dist_names = c("EUC",names(OT_EUC))
dimnames(KK) = list(samp_names,samp_names,dist_names)
for(vv in dist_names){
if( vv == "EUC" ){
dd = EUC
} else {
dd = OT_EUC[[vv]]$DIST
}
KK[,,vv] = D2K(D = dd)
rm(dd)
}
OMNI = matrix(0,nrow = 2,ncol = dim(KK)[3],
dimnames = list(all_LABS,dimnames(KK)[[3]]))
OMNI[1,1] = 1; OMNI[2,-1] = 1
reg_out = c()
for(out in all_out){
for(hBETA in hBETAs){
message(sprintf("%s: OUT = %s; hBETA = %s ...\n",date(),out,hBETA),appendLF = FALSE)
hBETA_2 = as.numeric(gsub("hBETA_","",hBETA))
if( out == "OLS" ){
# Run OLS regression with MiRKAT
for(LAB in all_LABS){
KK_tmp = list()
tmp_LABS = dimnames(KK)[[3]]
if( LAB == "EUC" ) tmp_LABS = tmp_LABS[grepl("EUC",tmp_LABS)]
if( LAB == "OT" ) tmp_LABS = tmp_LABS[!grepl("EUC",tmp_LABS)]
for(vv in tmp_LABS){
KK_tmp[[vv]] = KK[,,vv]
}
fit = MiRKAT(y = sim$OUT[[hBETA]]$YY,X = XX,
Ks = KK_tmp,out_type = "C",method = "permutation",
omnibus = "permutation",nperm = nPERM,
returnKRV = FALSE,returnR2 = FALSE)
fit
if( LAB == "EUC" ){
reg_out = rbind(reg_out,smart_df(OUT = out,hBETA = hBETA_2,
SOFT = "MiRKAT",LAB = LAB,PVAL_perm = as.numeric(fit$p_values)))
} else {
reg_out = rbind(reg_out,smart_df(OUT = out,hBETA = hBETA_2,
SOFT = "MiRKAT",LAB = "omni_OT",PVAL_perm = fit$omnibus_p))
reg_out = rbind(reg_out,smart_df(OUT = out,hBETA = hBETA_2,
SOFT = "MiRKAT",LAB = names(fit$p_values),
PVAL_perm = as.numeric(fit$p_values)))
}
}
# Run OLS regression with ROKET
lm_out = lm(sim$OUT[[hBETA]]$YY ~ .,data = smart_df(XX))
RESI = as.numeric(lm_out$residuals)
names(RESI) = samp_names
fit2 = kernTEST(RESI = RESI,KK = KK,
OMNI = OMNI,nPERMS = nPERM); rm(RESI)
fit2
reg_out = rbind(reg_out,smart_df(OUT = out,hBETA = hBETA_2,
SOFT = "Rcpp",LAB = "EUC",
PVAL_perm = as.numeric(fit2$omni_PVALs["EUC"])))
reg_out = rbind(reg_out,smart_df(OUT = out,hBETA = hBETA_2,
SOFT = "Rcpp",LAB = "omni_OT",
PVAL_perm = as.numeric(fit2$omni_PVALs["OT"])))
reg_out = rbind(reg_out,smart_df(OUT = out,hBETA = hBETA_2,
SOFT = "Rcpp",LAB = names(fit2$PVALs),
PVAL_perm = as.numeric(fit2$PVALs)))
}
if( out == "SURV" ){
OUTs = names(sim$OUT[[hBETA]])
OUTs = OUTs[grepl("SS_",OUTs)]
OUTs = gsub("SS_","",OUTs)
OUTs
for(OUT in OUTs){
# OUT = OUTs[1]
OUT2 = paste0("SURV_nCENS.",OUT)
tmp_surv = sim$OUT[[hBETA]]
SS = tmp_surv[,paste0("SS_",OUT)]
DD = tmp_surv[,paste0("DD_",OUT)]
# Run SURV regression with MiRKATS
for(LAB in all_LABS){
KK_tmp = list()
tmp_LABS = dimnames(KK)[[3]]
if( LAB == "EUC" ) tmp_LABS = tmp_LABS[grepl("EUC",tmp_LABS)]
if( LAB == "OT" ) tmp_LABS = tmp_LABS[!grepl("EUC",tmp_LABS)]
for(vv in tmp_LABS){
KK_tmp[[vv]] = KK[,,vv]
}
fit = suppressWarnings(MiRKATS(obstime = SS,delta = DD,X = XX,
Ks = KK_tmp,perm = TRUE,omnibus = "permutation",nperm = nPERM))
fit
if( LAB == "EUC" ){
reg_out = rbind(reg_out,smart_df(OUT = OUT2,hBETA = hBETA_2,
SOFT = "MiRKAT",LAB = LAB,
PVAL_perm = as.numeric(fit$p_values)))
} else {
reg_out = rbind(reg_out,smart_df(OUT = OUT2,hBETA = hBETA_2,
SOFT = "MiRKAT",LAB = "omni_OT",
PVAL_perm = fit$omnibus_p))
reg_out = rbind(reg_out,smart_df(OUT = OUT2,hBETA = hBETA_2,
SOFT = "MiRKAT",LAB = names(fit$p_values),
PVAL_perm = as.numeric(fit$p_values)))
}
}
# Run SURV regression with ROKET
cout = coxph(Surv(SS,DD) ~ .,data = smart_df(XX))
RESI = as.numeric(cout$residuals)
names(RESI) = samp_names
fit2 = kernTEST(RESI = RESI,KK = KK,
OMNI = OMNI,nPERMS = nPERM); rm(RESI)
names(fit2$PVALs) = names(KK)
fit2
reg_out = rbind(reg_out,smart_df(OUT = OUT2,hBETA = hBETA_2,
SOFT = "Rcpp",LAB = "EUC",
PVAL_perm = as.numeric(fit2$omni_PVALs["EUC"])))
reg_out = rbind(reg_out,smart_df(OUT = OUT2,hBETA = hBETA_2,
SOFT = "Rcpp",LAB = "omni_OT",
PVAL_perm = as.numeric(fit2$omni_PVALs["OT"])))
reg_out = rbind(reg_out,smart_df(OUT = OUT2,hBETA = hBETA_2,
SOFT = "Rcpp",LAB = names(fit2$PVALs)[-1],
PVAL_perm = as.numeric(fit2$PVALs[-1])))
}
}
}}
message(reg_out,appendLF = FALSE)
return(reg_out)
}
#' @title kOT_sim_REG
#' @param work_dir A full path to create "sim_ROKET" and subdirectories
#' @param NN A positive integer for sample size
#' @param nGENE A positive integer for number of genes to simulate
#' @param nPATH A positive integer for number of pathways to simulate
#' @param SCEN An integer taking values 1, 2, 3, or 4
#' @param rr A positive integer indexing a replicate
#' @return Nothing. A rds file is created within the simulation ROKET directory.
#' @export
kOT_sim_REG = function(work_dir,NN,nGENE,nPATH,SCEN,rr){
my_dirs = setdirs(work_dir = work_dir)
inputs_fn = file.path(my_dirs$sim_dir,
sprintf("inputs_nS.%s_nG.%s_nP.%s.rds",
NN,nGENE,nPATH))
inputs = readRDS(inputs_fn)
rds_fn = file.path(my_dirs$rout_dir,
sprintf("sim_rr_%s_SCEN%s.rds",rr,SCEN))
if( file.exists(rds_fn) ) unlink(rds_fn)
# Import mutations
sim_fn = file.path(my_dirs$reps_dir,
sprintf("sim_SCEN%s.rds",SCEN))
sim = readRDS(sim_fn)
sim$XX = inputs$XX
# Import OT
OT_fn = file.path(my_dirs$reps_dir,
sprintf("OT_SCEN%s.rds",SCEN))
if( !file.exists(OT_fn) ) stop(sprintf("OT %s not done!",SCEN))
OT = readRDS(OT_fn)
# Import outcomes
out_fn = file.path(my_dirs$reps_dir,
sprintf("sim_SCEN%s_rr%s.rds",SCEN,rr))
if( !file.exists(out_fn) )
stop(sprintf("Outcomes from SCEN %s rep %s missing",SCEN,rr))
sim$OUT = readRDS(out_fn)
# Run gene regressions
reg_gene = c()
OUTs = c("OLS","SURV")
nom_PVAL = list()
for(out in OUTs){
# out = OUTs[1]
tmp_out = kOT_sim_GENE(sim = sim,
out = out,hBETAs = NULL,nPERM = 1e2,
samp_thres = 20)
reg_gene = rbind(reg_gene,tmp_out$RES)
nom_PVAL[[out]] = tmp_out$nom_PVAL
}
# Run kernel regressions
kern_rds_fn = file.path(my_dirs$rout_dir,
sprintf("sim_rr_%s_SCEN%s_kern.rds",rr,SCEN))
if( !file.exists(kern_rds_fn) ){
reg_kern = kOT_sim_KERN(sim = sim,OT = OT,
nPERM = 2e3,hBETAs = NULL)
saveRDS(reg_kern,kern_rds_fn)
}
reg_kern = readRDS(kern_rds_fn)
saveRDS(list(GENE = reg_gene,
nom_PVAL = nom_PVAL,KERN = reg_kern),rds_fn)
return(NULL)
}
#' @title kOT_sim_AGG
#' @param work_dir A full path to create "sim_ROKET" and subdirectories
#' @return Nothing. Png files are created within the simulation ROKET directory.
#' @export
kOT_sim_AGG = function(work_dir){
my_dirs = setdirs(work_dir = work_dir)
res_fn = file.path(my_dirs$sout_dir,"res.rds")
ures_fn = file.path(my_dirs$sout_dir,"ures.rds")
# Aggregate replicates
if( !file.exists(ures_fn) ){
RR = list.files(my_dirs$rout_dir,pattern = "sim")
RR = RR[grepl("_rr_",RR)]
RR = RR[grepl(".rds$",RR)]
RR = RR[!grepl("kern",RR)]
RR = RR[grepl("SCEN1",RR)]
RR = length(RR); RR
res = c()
fin_nom_PVAL = list()
for(SCEN in seq(4)){
# SCEN = 1
message(sprintf("%s: SCEN = %s ...\n",date(),SCEN),appendLF = FALSE)
for(rr in seq(RR)){
# rr = 1
smart_progress(ii = rr,nn = RR,iter2 = 1e2)
rr_fn = file.path(my_dirs$rout_dir,
sprintf("sim_rr_%s_SCEN%s.rds",rr,SCEN))
if( !file.exists(rr_fn) ){
message(sprintf("%s miss ",rr),appendLF = FALSE)
next
}
tmp_rds = readRDS(rr_fn)
if( rr == 1 ){
nom_PVAL = tmp_rds$nom_PVAL
nom_PVAL$OLS = ifelse(nom_PVAL$OLS < 0.05,1,0)
nom_PVAL$SURV = ifelse(nom_PVAL$SURV < 0.05,1,0)
} else {
nom_PVAL$OLS = nom_PVAL$OLS + ifelse(tmp_rds$nom_PVAL$OLS < 0.05,1,0)
nom_PVAL$SURV = nom_PVAL$SURV + ifelse(tmp_rds$nom_PVAL$SURV < 0.05,1,0)
}
tmp_gene = tmp_rds$GENE
tmp_gene$RR = rr
tmp_gene$SCEN = SCEN
res_gene = c()
for(VAR in c("PVAL_perm")){
tmp_df = tmp_gene
tmp_df$THRES = tmp_df[,VAR]
VAR2 = VAR
if( VAR == "PVAL_perm" ) VAR2 = "minP_gene_perm"
tmp_df$DIST = VAR2
tmp_df$SOFT = "none"
tmp_df = tmp_df[,c("RR","OUT","DIST","hBETA",
"SOFT","SCEN","THRES","nTEST_genes")]
res_gene = rbind(res_gene,tmp_df); rm(tmp_df)
}
tmp_kern = tmp_rds$KERN[which(tmp_rds$KERN$hBETA >= 0),]
tmp_kern$RR = rr
tmp_kern$SCEN = SCEN
tmp_kern$THRES = tmp_kern$PVAL_perm
tmp_kern$DIST = tmp_kern$LAB
tmp_kern$nTEST_genes = nrow(tmp_rds$nom_PVAL$OLS)
tmp_kern = tmp_kern[,c("RR","OUT","DIST","hBETA",
"SOFT","SCEN","THRES","nTEST_genes")]
tmp_kern[1:5,]
res = rbind(res,res_gene,tmp_kern)
rm(tmp_rds,tmp_gene,tmp_kern)
}
nom_PVAL$OLS = nom_PVAL$OLS / RR
nom_PVAL$SURV = nom_PVAL$SURV / RR
fin_nom_PVAL[[sprintf("SCEN%s",SCEN)]] = list(nom_PVAL = nom_PVAL)
rm(nom_PVAL)
}
saveRDS(list(RES = res,
fin_nom_PVAL = fin_nom_PVAL),
res_fn)
# Calculate hypothesis rejection
rds = readRDS(res_fn)
res = rds$RES
ures = unique(res[,c("OUT","DIST","SOFT","SCEN","hBETA")])
ures$REJECT = NA
dim(ures); ures[1:5,]
tot = nrow(ures); tot
message(sprintf("%s: Calculate %s rejections ...\n",date(),tot),appendLF = FALSE)
for(ii in seq(tot)){
# ii = 1
smart_progress(ii = ii,nn = tot,iter2 = 2e2)
idx = which(res$OUT == ures$OUT[ii]
& res$DIST == ures$DIST[ii]
& res$SOFT == ures$SOFT[ii]
& res$SCEN == ures$SCEN[ii]
& res$hBETA == ures$hBETA[ii])
length(idx)
ures$REJECT[ii] = mean(res$THRES[idx] < 0.05)
rm(idx)
}
mDIST = c("minP_gene_perm")
tmp_df = ures[which(ures$DIST %in% mDIST),]
tmp_df$SOFT = "Rcpp"
ures$SOFT[ures$DIST %in% mDIST] = "MiRKAT"
ures = rbind(ures,tmp_df); rm(tmp_df)
ures$SOFT[grepl("Rcpp",ures$SOFT)] = "ROKET"
saveRDS(ures,ures_fn)
}
# Polish
ures = readRDS(ures_fn)
ures[1:3,]
smart_table(ures$DIST)
tmp_lev = sort(unique(ures$DIST)); tmp_lev
tmp_lev = tmp_lev[c(6,1,5,4,3,2,7)]; tmp_lev
ures$DIST2 = factor(ures$DIST,levels = tmp_lev,
labels = c("Gene-Based (Perm)","Euclidean",
# sprintf("OT (\u03BB = %s)",c("\u221E","5.0","1.0","0.5")),
"OT (Balanced)",
sprintf("OT (\u03BB = %s)",c("5.0","1.0","0.5")),
"OT (omnibus)"))
ures$SCEN2 = factor(ures$SCEN,
levels = sort(unique(ures$SCEN)),
labels = paste0("Scenario ",sort(unique(ures$SCEN))))
# dim(ures); ures[1:3,]
# Plot for paper
hBETA2 = DIST2 = NULL
if( TRUE ){
ures2 = ures[which(ures$SOFT == "ROKET" & ures$hBETA >= 0
& ures$OUT %in% c("OLS",sprintf("SURV_nCENS.%s",c(25,50,75,100)))
),]
# dim(ures2); ures2[1:5,]
lev_OUT = sort(unique(ures2$OUT)); lev_OUT
all_OUT = c("OLS",sprintf("SURV_nCENS.%s",c(100,75,50,25))); all_OUT
lev_OUT = all_OUT[which(all_OUT %in% lev_OUT)]; lev_OUT
ures2$OUT = factor(ures2$OUT,levels = lev_OUT,
labels = c("OLS",sprintf("CENS = %s%%",c(0,25,50,75))))
# table(ures2$OUT)
# ures2$DIST
my_theme = theme(legend.position = c("none","bottom","right")[2],
text = element_text(size = 45),
axis.text.x = element_text(size = 30),
# axis.text.y = element_text(size = 42),
# strip.text.y = element_text(size = 18),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "grey50",
size = 1,linetype = "dotted"),
panel.spacing.x = unit(4,"lines"),
panel.spacing.y = unit(2,"lines"),
legend.key.width = unit(4,"line"),
# legend.title = element_text(size = 34),
# legend.text = element_text(size = 34),
plot.title = element_text(hjust = 0.5))##,size = 38))
gg = ggplot(data = ures2,
mapping = aes(x = hBETA,y = REJECT,color = DIST2)) +
facet_grid(SCEN2 ~ OUT) + geom_line(size = 3) +
geom_hline(aes(yintercept = 0.05),
size = 2,linetype = 2,color = "black") +
ylim(c(0,1)) +
xlab(expression(gamma)) + ylab("Power / Type I Error") +
labs(color = "Approach") + my_theme
# gg
png_fn = file.path(my_dirs$sout_dir,"final_sim.png")
ggsave(png_fn,plot = gg,device = "png",
width = 25,height = 25,units = "in",dpi = 75)
rm(gg)
}
if( TRUE ){ # Check distribution of gene power
rds = readRDS(res_fn)
names(rds)
nPVAL = rds$fin_nom_PVAL
names(nPVAL)
wPVAL = c("nom_PVAL")
out = c()
for(SCEN in seq(4)){
# SCEN = 1
SCEN2 = sprintf("SCEN%s",SCEN)
for(OUT in names(nPVAL[[SCEN2]][[wPVAL]])){
# OUT = c("OLS","SURV")[2]
cnames = colnames(nPVAL[[SCEN2]][[wPVAL]][[OUT]])
for(VAR in cnames){
if( OUT == "OLS" ){
hBETA = as.numeric(gsub("OLS_hBETA_","",VAR))
OUT2 = OUT
} else if( OUT == "SURV" ){
nCENS = as.integer(strsplit(VAR,"_")[[1]][1])
OUT2 = sprintf("SURV_nCENS%s",nCENS)
hBETA = as.numeric(strsplit(VAR,"_")[[1]][3])
}
REJECT = nPVAL[[SCEN2]][[wPVAL]][[OUT]][,VAR]
REJECT = as.numeric(REJECT[REJECT > 0])
out = rbind(out,smart_df(SCEN = SCEN,
OUT = OUT2,hBETA = hBETA,REJECT = REJECT))
rm(REJECT)
}}}
out$SCEN2 = sprintf("Scenario = %s",out$SCEN)
tmp_lev = sort(unique(out$OUT)); tmp_lev
tmp_lev = tmp_lev[c(1,2,5,4,3)]; tmp_lev
out$OUT2 = factor(out$OUT,levels = tmp_lev,
labels = c("OLS",sprintf("CENS = %s%%",c(0,25,50,75))))
out$hBETA2 = factor(out$hBETA,levels = sort(unique(out$hBETA)),
labels = smart_digits(sort(unique(out$hBETA)),digits = 1))
dim(out); out[1:20,]
my_theme = theme(legend.position = c("none","bottom","right")[1],
text = element_text(size = 45),
axis.text.x = element_text(size = 30),
# axis.text.y = element_text(size = 42),
# strip.text.y = element_text(size = 18),
panel.background = element_blank(),
panel.grid.major = element_line(colour = "grey50",
size = 1,linetype = "dotted"),
panel.spacing.x = unit(4,"lines"),
panel.spacing.y = unit(2,"lines"),
legend.key.width = unit(2,"line"),
# legend.title = element_text(size = 34),
# legend.text = element_text(size = 34),
plot.title = element_text(hjust = 0.5))
gg = ggplot(data = out,mapping = aes(x = hBETA2,y = REJECT)) +
geom_boxplot(fill = "#56B4E9") + facet_grid(SCEN2 ~ OUT2) +
geom_hline(aes(yintercept = 0.05),size = 2,linetype = 2,color = "black") +
xlab(expression(gamma)) + ylab("Power / Type I Error") +
my_theme
png_fn = file.path(my_dirs$sout_dir,
sprintf("final_sim_%s.png",wPVAL))
ggsave(png_fn,plot = gg,device = "png",
width = 45,height = 25,units = "in",dpi = 75)
rm(gg)
}
return(NULL)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.